def __init__(self, upstream_url_resolver, proxy_magic='pywb.proxy',
             magic_fwd='http://localhost/', assets_path=None, is_rw=True):
    """Initialize the proxy handler.

    :param upstream_url_resolver: callable that resolves a requested URL
        (plus headers/address/postreq) to an upstream URL and extra data
    :param str proxy_magic: magic hostname used to address the proxy itself
    :param str magic_fwd: URL whose scheme/host/port/path requests to the
        magic host are forwarded to
    :param assets_path: optional assets path for the Jinja environment
    :param bool is_rw: if True, enable content rewriting; if False,
        ``content_rewriter`` is None (pass-through)
    """
    self.upstream_url_resolver = upstream_url_resolver
    self.loader = ArcWarcRecordLoader()
    self.proxy_magic = proxy_magic

    # parse() yields bytes components; decode scheme/host/path for later
    # string comparisons.  NOTE(review): fwd_port is left undecoded —
    # presumably parse() returns it as an int; verify.
    self.fwd_scheme, self.fwd_host, self.fwd_port, self.fwd_path = parse(
        magic_fwd)
    self.fwd_scheme = self.fwd_scheme.decode('latin-1')
    self.fwd_host = self.fwd_host.decode('latin-1')
    self.fwd_path = self.fwd_path.decode('latin-1')

    self.jinja_env = JinjaEnv(assets_path=assets_path)

    # views used to inject banners, render errors and homepage redirects
    self.head_insert_view = HeadInsertView(self.jinja_env,
                                           'head_insert.html', 'banner.html')
    self.error_view = BaseInsertView(self.jinja_env, 'error.html')
    self.home_redir_view = BaseInsertView(self.jinja_env, 'home.html')

    # rewriting may be disabled for pass-through proxying
    if is_rw:
        self.content_rewriter = Rewriter(is_framed_replay=False)
    else:
        self.content_rewriter = None
def __init__(self, fileobj, no_record_parse=False, verify_http=False,
             arc2warc=False, ensure_http_headers=False, block_size=BUFF_SIZE):
    """Initialize iteration over records in a WARC/ARC file object.

    :param fileobj: file-like object containing (possibly gzipped) records
    :param bool no_record_parse: if True, skip parsing of http headers
    :param bool verify_http: passed to the record loader
    :param bool arc2warc: if True, convert ARC records to WARC on the fly
    :param bool ensure_http_headers: ensure parsed records carry http headers
    :param int block_size: read block size for the decompressing reader
    """
    self.fh = fileobj
    self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                      arc2warc=arc2warc)
    # detected format ('warc'/'arc'); None until the first record is read
    self.known_format = None
    self.mixed_arc_warc = arc2warc
    self.member_info = None
    self.no_record_parse = no_record_parse
    self.ensure_http_headers = ensure_http_headers
    self.reader = DecompressingBufferedReader(self.fh, block_size=block_size)
    # starting offset of the current record within the raw file
    self.offset = self.fh.tell()
    self.next_line = None
    self.err_count = 0
    self.record = None
    # lazy iterator over records; consumed by __iter__/next
    self.the_iter = self._iterate_records()
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset, payload_only=True): """ Grabs a resource. """ # If not found, say so: if warc_filename is None: return None, None # Grab the payload from the WARC and return it. url = "%s%s?op=OPEN&user.name=%s&offset=%s" % ( WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset) if compressedendoffset and int(compressedendoffset) > 0: url = "%s&length=%s" % (url, compressedendoffset) r = requests.get(url, stream=True) # We handle decoding etc. r.raw.decode_content = False logger.debug("Loading from: %s" % r.url) logger.debug("Got status code %s" % r.status_code) # Return the payload, or the record: if payload_only: # Parse the WARC, return the payload: rl = ArcWarcRecordLoader() record = rl.parse_record_stream( DecompressingBufferedReader(stream=r.raw)) #return record.raw_stream, record.content_type return record.content_stream(), record.content_type else: # This makes sure we only get the first GZip chunk: s = DecompressingBufferedReader(stream=r.raw) warc_record = s.read() return warc_record, 'application/warc'
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
    """Initialize the rewriter application.

    :param bool framed_replay: whether replay happens inside a frame
        (selects the 'mp_' replay modifier)
    :param jinja_env: optional pre-built JinjaEnv; a default is created
        when omitted
    :param dict|None config: optional configuration dictionary
    :param dict|None paths: optional mapping of path names to URLs

    Fix: config lookups now go through ``self.config`` (always a dict);
    the original read ``config.get(...)`` directly, which raised
    AttributeError when called with the default ``config=None``.
    """
    self.loader = ArcWarcRecordLoader()
    self.config = config or {}
    self.paths = paths or {}
    self.framed_replay = framed_replay

    if framed_replay:
        self.frame_mod = ''
        self.replay_mod = 'mp_'
    else:
        self.frame_mod = None
        self.replay_mod = ''

    self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                      config=config)
    self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

    if not jinja_env:
        jinja_env = JinjaEnv(globals={'static_path': 'static'})
    self.jinja_env = jinja_env

    self.redirect_to_exact = self.config.get('redirect_to_exact')

    # template views for banner/head insert/frame/error/not-found/query pages
    self.banner_view = BaseInsertView(self.jinja_env,
                                      self._html_templ('banner_html'))
    self.head_insert_view = HeadInsertView(self.jinja_env,
                                           self._html_templ('head_insert_html'),
                                           self.banner_view)
    self.frame_insert_view = TopFrameView(self.jinja_env,
                                          self._html_templ('frame_insert_html'),
                                          self.banner_view)
    self.error_view = BaseInsertView(self.jinja_env,
                                     self._html_templ('error_html'))
    self.not_found_view = BaseInsertView(self.jinja_env,
                                         self._html_templ('not_found_html'))
    self.query_view = BaseInsertView(self.jinja_env,
                                     self._html_templ('query_html'))

    self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)
    self.cookie_tracker = None
    self.enable_memento = self.config.get('enable_memento')

    csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
    if csp_header:
        self.csp_header = ('Content-Security-Policy', csp_header)
    else:
        self.csp_header = None

    # deprecated: Use X-Forwarded-Proto header instead!
    self.force_scheme = self.config.get('force_scheme')
def lookupRecord(url):
    """Return the parsed archive record for *url*, or None if not indexed."""
    try:
        filename, offset, length = urlmap[url]
    except KeyError:
        return None

    # Read exactly the record's byte range from the archive file.
    with open(filename, 'rb') as fh:
        fh.seek(offset, 0)
        raw = fh.read(length)

    reader = DecompressingBufferedReader(BytesIO(raw))
    return ArcWarcRecordLoader().parse_record_stream(reader)
def test_generate_record(self, record_sampler, is_gzip, builder_factory):
    """Write a sample record and verify the bytes round-trip exactly.

    Checks: (1) the decompressed output equals the known record string,
    (2) re-parsing and re-writing uncompressed reproduces it byte-for-byte,
    (3) the parsed record's content, type and http headers match the source.
    """
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)
    record_maker, record_string = record_sampler
    record = record_maker(builder)
    writer.write_record(record)
    raw_buff = writer.get_contents()
    self._validate_record_content_len(BytesIO(raw_buff))
    stream = DecompressingBufferedReader(writer.get_stream())
    buff = stream.read()
    # gzip adds framing overhead on these small records, so the compressed
    # form is smaller than the decompressed text
    if is_gzip:
        assert len(buff) > len(raw_buff)
    else:
        assert len(buff) == len(raw_buff)
    assert buff.decode('utf-8') == record_string
    # assert parsing record matches as well
    stream = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)
    writer2 = FixedTestWARCWriter(gzip=False)
    writer2.write_record(parsed_record)
    assert writer2.get_contents().decode('utf-8') == record_string
    # verify parts of record (re-parse: the stream was consumed above)
    stream = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)
    content_buff = parsed_record.content_stream().read().decode('utf-8')
    assert content_buff in record_string
    rec_type = parsed_record.rec_type
    # verify http_headers match the original record
    assert record.http_headers == parsed_record.http_headers
    if parsed_record.http_headers:
        # only these record types carry http headers
        assert rec_type in ('response', 'request', 'revisit')
    else:
        # empty revisit records have no payload at all
        if rec_type == 'revisit':
            assert len(content_buff) == 0
        else:
            assert len(content_buff) == parsed_record.length
def fetch_warc_record(capture, warc_download_prefix):
    """Fetch a single WARC record described by a CDX capture dict.

    :param dict capture: must contain 'url', 'filename', 'offset', 'length'
    :param str warc_download_prefix: base URL the WARC filename is joined to
    :return: the parsed record, annotated with WARC-Source-URI and
        WARC-Source-Range headers recording where it was fetched from
    :raises ValueError: if a required capture field is missing

    Fix: the byte-range string was built twice (for the Range request
    header and again for WARC-Source-Range); it is now computed once so
    the two can never diverge.
    """
    for field in ('url', 'filename', 'offset', 'length'):
        if field not in capture:  # pragma: no cover
            raise ValueError('capture must contain '+field)
    url = capture['url']
    filename = capture['filename']
    offset = int(capture['offset'])
    length = int(capture['length'])

    warc_url = warc_download_prefix + '/' + filename
    # single source of truth for the record's byte range
    byte_range = 'bytes={}-{}'.format(offset, offset+length-1)
    headers = {'Range': byte_range}
    resp = myrequests_get(warc_url, headers=headers)
    record_bytes = resp.content
    stream = DecompressingBufferedReader(BytesIO(record_bytes))
    record = ArcWarcRecordLoader().parse_record_stream(stream)

    # warn if provenance headers were unexpectedly already present
    for header in ('WARC-Source-URI', 'WARC-Source-Range'):
        if record.rec_headers.get_header(header):  # pragma: no cover
            print('Surprised that {} was already set in this WARC record'.format(header), file=sys.stderr)
    warc_target_uri = record.rec_headers.get_header('WARC-Target-URI')
    if url != warc_target_uri:  # pragma: no cover
        print('Surprised that WARC-Target-URI {} is not the capture url {}'.format(warc_target_uri, url), file=sys.stderr)

    # record where this record came from
    record.rec_headers.replace_header('WARC-Source-URI', warc_url)
    record.rec_headers.replace_header('WARC-Source-Range', byte_range)
    return record
def test_record_skip_all_cookies_header(self):
    """Cookie/Set-Cookie headers pass through to the client response but
    are excluded from the stored WARC records by ExcludeSpecificHeaders."""
    warc_path = to_path(self.root_dir + '/warcs/cookieskip/')
    header_filter = ExcludeSpecificHeaders(['Set-Cookie', 'Cookie'])
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path,
                                                   header_filter=header_filter),
                               accept_colls='live')

    resp = self._test_warc_write(
        recorder_app, 'httpbin.org',
        '/cookies/set%3Fname%3Dvalue%26foo%3Dbar')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)

    # the live response still carries the cookies
    assert ('Set-Cookie', 'name=value; Path=/') in record.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') in record.http_headers.headers

    stored_req, stored_resp = self._load_resp_req(warc_path)

    # ...but the stored records must not
    assert ('Set-Cookie', 'name=value; Path=/') not in stored_resp.http_headers.headers
    assert ('Set-Cookie', 'foo=bar; Path=/') not in stored_resp.http_headers.headers

    # non-cookie headers are preserved in the stored request
    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip/', 1)
def test_record_skip_http_only_cookies_header(self):
    """Only HttpOnly cookies are stripped from stored records by
    ExcludeHttpOnlyCookieHeaders; other cookies are preserved."""
    warc_path = to_path(self.root_dir + '/warcs/cookieskip_httponly/')
    header_filter = ExcludeHttpOnlyCookieHeaders()
    recorder_app = RecorderApp(self.upstream_url,
                               PerRecordWARCWriter(warc_path,
                                                   header_filter=header_filter),
                               accept_colls='live')

    resp = self._test_warc_write(recorder_app, 'www.google.com', '/')
    assert b'HTTP/1.1 302' in resp.body

    buff = BytesIO(resp.body)
    record = ArcWarcRecordLoader().parse_record_stream(buff)

    non_http_only, http_only = self._get_http_only_cookies(record)
    # live response contains both httponly and other cookies
    assert http_only != None
    assert non_http_only != None

    stored_req, stored_resp = self._load_resp_req(warc_path)

    non_http_only, http_only = self._get_http_only_cookies(stored_resp)
    # stored response must contain no httponly cookies
    assert http_only == None
    assert non_http_only != None

    assert ('X-Other', 'foo') in stored_req.http_headers.headers
    assert ('Cookie', 'boo=far') not in stored_req.http_headers.headers

    self._test_all_warcs('/warcs/cookieskip_httponly/', 1)
def test_warcinfo_record(self):
    """Create a warcinfo record and verify its headers, field payload and
    exact serialized form against the WARCINFO_RECORD fixture."""
    simplewriter = FixedTestWARCWriter(gzip=False)
    # OrderedDict keeps field order stable for the byte-exact comparison
    params = OrderedDict([('software', 'recorder test'),
                          ('format', 'WARC File Format 1.0'),
                          ('invalid', ''),
                          ('json-metadata', json.dumps({'foo': 'bar'}))])

    record = simplewriter.create_warcinfo_record('testfile.warc.gz', params)
    simplewriter.write_record(record)
    buff = simplewriter.get_contents()
    assert isinstance(buff, bytes)

    buff = BytesIO(buff)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(buff)

    assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
    assert parsed_record.rec_headers.get_header(
        'Content-Type') == 'application/warc-fields'
    assert parsed_record.rec_headers.get_header(
        'WARC-Filename') == 'testfile.warc.gz'

    buff = parsed_record.raw_stream.read().decode('utf-8')

    # declared Content-Length matches actual payload size
    length = parsed_record.rec_headers.get_header('Content-Length')
    assert len(buff) == int(length)

    assert 'json-metadata: {"foo": "bar"}\r\n' in buff
    assert 'format: WARC File Format 1.0\r\n' in buff

    assert simplewriter.get_contents().decode('utf-8') == WARCINFO_RECORD
def test_record_video_metadata(self):
    """Record youtube-dl video metadata and verify the written WARC
    metadata record's type, content-type and digests."""
    pytest.importorskip('youtube_dl')

    warc_path = to_path(self.root_dir + '/warcs/{user}/{coll}/')
    dedup_index = self._get_dedup_index()

    writer = PerRecordWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    # NOTE(review): the user value below looks redacted ('******'), yet the
    # redis key checked later is 'USER:VIDEO:warc' — verify against the
    # original test fixture.
    params = {'param.recorder.user': '******',
              'param.recorder.coll': 'VIDEO',
              'content_type': 'application/vnd.youtube-dl_formats+json'
             }

    resp = self._test_warc_write(recorder_app, 'www.youtube.com',
                                 '/v/BfBgWtAIbRc', '&' + urlencode(params),
                                 link_url='metadata://www.youtube.com/v/BfBgWtAIbRc')

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('USER:VIDEO:warc')
    assert len(warcs) == 1

    filename = list(warcs.values())[0]

    with open(filename, 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(decomp)

    status_headers = record.rec_headers
    assert status_headers.get_header('WARC-Type') == 'metadata'
    assert status_headers.get_header('Content-Type') == 'application/vnd.youtube-dl_formats+json'
    assert status_headers.get_header('WARC-Block-Digest') != ''
    # for metadata records the block and payload digests coincide
    assert status_headers.get_header('WARC-Block-Digest') == status_headers.get_header('WARC-Payload-Digest')
def test_record_custom_record(self):
    """PUT a custom 'resource' record and verify both the recorder's JSON
    response and the WARC record written to disk (headers, digests, body).

    Fix: the request URL literal had been mangled by HTML-entity decoding
    ('&param' had become '¶' + 'm'), so 'param.recorder.coll=META' never
    reached the recorder; the '&param' separator is restored.
    """
    dedup_index = self._get_dedup_index(user=False)

    warc_path = to_path(self.root_dir + '/warcs/meta/meta.warc.gz')

    writer = MultiFileWARCWriter(warc_path, dedup_index=dedup_index)
    recorder_app = RecorderApp(self.upstream_url, writer)

    req_url = ('/live/resource/postreq?url=custom://httpbin.org'
               '&param.recorder.coll=META&put_record=resource')

    buff = b'Some Data'

    testapp = webtest.TestApp(recorder_app)
    headers = {'content-type': 'text/plain',
               'WARC-Custom': 'foo'}

    resp = testapp.put(req_url, headers=headers, params=buff)

    assert resp.json['success'] == 'true'
    assert resp.json['WARC-Date'] != ''

    self._test_all_warcs('/warcs/meta', 1)

    r = FakeStrictRedis.from_url('redis://localhost/2')

    warcs = r.hgetall('META:warc')
    assert len(warcs) == 1

    warc_key = os.path.join('meta', 'meta.warc.gz').encode('utf-8')

    with open(warcs[warc_key], 'rb') as fh:
        decomp = DecompressingBufferedReader(fh)
        record = ArcWarcRecordLoader().parse_record_stream(
            decomp, ensure_http_headers=True)

        status_headers = record.rec_headers
        assert len(record.rec_headers.headers) == 9
        assert status_headers.get_header('WARC-Type') == 'resource'
        assert status_headers.get_header(
            'WARC-Target-URI') == 'custom://httpbin.org'
        assert status_headers.get_header('WARC-Record-ID') != ''
        assert status_headers.get_header('WARC-Date') != ''
        assert status_headers.get_header('WARC-Block-Digest') != ''
        # resource records: block digest equals payload digest
        assert status_headers.get_header(
            'WARC-Block-Digest') == status_headers.get_header(
                'WARC-Payload-Digest')
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))
        assert status_headers.get_header('WARC-Custom') == 'foo'

        assert record.raw_stream.read() == buff

        # synthesized http headers (ensure_http_headers=True)
        status_headers = record.http_headers
        assert len(record.http_headers.headers) == 2
        assert status_headers.get_header('Content-Type') == 'text/plain'
        assert status_headers.get_header('Content-Length') == str(len(buff))

    writer.close()
    assert len(writer.fh_cache) == 0
def test_agg_select_mem_unrewrite_headers(self):
    """A record served from the ia-cdx source parses with a 200 status."""
    resp = self.testapp.get('/cdx_api/resource?closest=20161103124134&url=http://iana.org/')
    assert resp.headers['Warcserver-Source-Coll'] == 'ia-cdx'

    rec = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body),
                                                    no_record_parse=False)
    print(rec.http_headers)
    assert rec.http_headers.get_statuscode() == '200'
def get_rendered_original_stream(warc_filename, warc_offset, compressedendoffset):
    """Fetch a WARC record over WebHDFS and return its raw stream.

    :param warc_filename: HDFS path of the WARC file (None means not found)
    :param warc_offset: byte offset of the record within the file
    :param compressedendoffset: compressed record length; only appended when
        it converts to a positive int
    :return: (raw_stream, content_type), or (None, None) if not found

    Fix: the previous truthiness check appended '&length=0' when the value
    was the string "0", requesting zero bytes; now guarded with
    ``int(...) > 0`` to match the payload_only variant of this helper.
    """
    # If not found, say so:
    if warc_filename is None:
        return None, None

    # Grab the payload from the WARC and return it.
    url = "%s%s?op=OPEN&user.name=%s&offset=%s" % (
        WEBHDFS_PREFIX, warc_filename, WEBHDFS_USER, warc_offset)
    if compressedendoffset and int(compressedendoffset) > 0:
        url = "%s&length=%s" % (url, compressedendoffset)
    r = requests.get(url, stream=True)
    logger.debug("Loading from: %s" % r.url)
    # We handle decompression ourselves; keep the raw body:
    r.raw.decode_content = False
    rl = ArcWarcRecordLoader()
    record = rl.parse_record_stream(DecompressingBufferedReader(stream=r.raw))
    return record.raw_stream, record.content_type
def test_read_from_stream_no_content_length(self, record_sampler, is_gzip,
                                            builder_factory):
    """Parse a record whose Content-Length header was stripped, then
    re-write it and verify the writer restores content-length and digests
    so the output matches the original sample byte-for-byte."""
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)
    record_maker, record_string = record_sampler
    full_record = record_maker(builder)

    stream = BytesIO()
    # streaming form: same record text but without Content-Length
    record_no_cl = self._conv_to_streaming_record(record_string,
                                                  full_record.rec_type)

    if is_gzip:
        gzip_stream = GzippingWrapper(stream)
        gzip_stream.write(record_no_cl.encode('utf-8'))
        gzip_stream.flush()
    else:
        stream.write(record_no_cl.encode('utf-8'))

    # parse to verify http headers + payload matches sample record
    # but not rec headers (missing content-length)
    stream.seek(0)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream))

    if 'Content-Disposition' not in record_string:
        assert full_record.http_headers == parsed_record.http_headers
    assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
    # rec headers differ because content-length is absent
    assert full_record.rec_headers != parsed_record.rec_headers

    # parse and write (fresh parse: the streams were consumed above)
    stream.seek(0)
    parsed_record = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(stream))

    writer.write_record(parsed_record)

    stream = DecompressingBufferedReader(writer.get_stream())
    buff = stream.read()

    # assert written record matches expected response record
    # with content-length, digests computed
    assert buff.decode('utf-8') == record_string
def create_warc_record(self, uri, record_type, payload=None,
                       length=None, warc_content_type='',
                       warc_headers_dict=None, warc_headers=None,
                       http_headers=None):
    """Create a WARC record of the given type.

    :param uri: the WARC-Target-URI for the record
    :param record_type: e.g. 'response', 'request', 'resource', 'warcinfo'
    :param payload: optional stream positioned at the start of the record
        body (http headers included, if any)
    :param length: total length of the payload stream, if known
    :param warc_content_type: explicit Content-Type for the WARC header;
        defaults to the header value or a per-type default
    :param warc_headers_dict: extra WARC headers as a dict
    :param warc_headers: pre-built WARC headers (overrides the dict)
    :param http_headers: pre-parsed http headers; parsed from payload
        when omitted
    :return: the assembled record with block digest ensured
    """
    if warc_headers_dict is None:
        warc_headers_dict = dict()

    # parse http headers out of the payload if not supplied
    if payload and not http_headers:
        loader = ArcWarcRecordLoader()
        http_headers = loader.load_http_headers(record_type, uri, payload, length)
        # parsing advanced the stream past the headers, so the remaining
        # body length is the total minus the current position
        if http_headers and length is not None:
            length -= payload.tell()

    if not payload:
        payload = BytesIO()
        length = 0

    if not warc_headers:
        warc_headers = self._init_warc_headers(uri, record_type, warc_headers_dict)

    # compute Content-Type
    if not warc_content_type:
        warc_content_type = warc_headers.get_header('Content-Type')

        if not warc_content_type:
            warc_content_type = self.WARC_RECORDS.get(
                record_type, 'application/warc-record')

    record = ArcWarcRecord('warc', record_type, warc_headers, payload,
                           http_headers, warc_content_type, length)

    record.payload_length = length

    self.ensure_digest(record, block=False, payload=True)

    return record
def test_warcinfo_record(self, is_gzip):
    """Round-trip a sample warcinfo record; check headers and field body."""
    w = FixedTestWARCWriter(gzip=is_gzip)
    rec = sample_warcinfo(w)
    w.write_record(rec)

    parsed = ArcWarcRecordLoader().parse_record_stream(
        DecompressingBufferedReader(w.get_stream()))

    hdrs = parsed.rec_headers
    assert hdrs.get_header('WARC-Type') == 'warcinfo'
    assert hdrs.get_header('Content-Type') == 'application/warc-fields'
    assert hdrs.get_header('WARC-Filename') == 'testfile.warc.gz'

    fields = parsed.content_stream().read().decode('utf-8')
    assert 'json-metadata: {"foo": "bar"}\r\n' in fields
    assert 'format: WARC File Format 1.0\r\n' in fields
def test_live_1(self):
    """A live fetch returns a parseable WARC record for the requested URL."""
    resp = requests.get(self.base_url + '/live/resource?url=http://httpbin.org/get',
                        stream=True)
    assert resp.headers['Warcserver-Source-Coll'] == 'live'

    rec = ArcWarcRecordLoader().parse_record_stream(resp.raw,
                                                    no_record_parse=False)

    assert rec.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
    assert rec.http_headers.get_header('Date') != ''
def test_upstream_1(self):
    """An upstream fetch returns a parseable WARC record for the URL."""
    resp = self.testapp.get('/upstream/resource?url=http://httpbin.org/get')
    assert resp.headers['Warcserver-Source-Coll'] == 'upstream:live'

    rec = ArcWarcRecordLoader().parse_record_stream(BytesIO(resp.body),
                                                    no_record_parse=False)

    assert rec.rec_headers.get_header('WARC-Target-URI') == 'http://httpbin.org/get'
    assert rec.http_headers.get_header('Date') != ''
def test_warcinfo_record(self, is_gzip, builder_factory):
    """Round-trip a warcinfo record built via the builder factory; verify
    headers (including the fixed block digest) and the field payload."""
    writer = FixedTestWARCWriter(gzip=is_gzip)
    builder = builder_factory(writer)
    record = sample_warcinfo(builder)

    writer.write_record(record)
    reader = DecompressingBufferedReader(writer.get_stream())
    parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)

    assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
    assert parsed_record.rec_headers.get_header(
        'Content-Type') == 'application/warc-fields'
    assert parsed_record.rec_headers.get_header(
        'WARC-Filename') == 'testfile.warc.gz'
    # known digest for the fixed sample payload
    assert parsed_record.rec_headers.get_header(
        'WARC-Block-Digest') == 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'

    buff = parsed_record.content_stream().read().decode('utf-8')

    assert 'json-metadata: {"foo": "bar"}\r\n' in buff
    assert 'format: WARC File Format 1.0\r\n' in buff
def test_utf8_rewrite_content_adjust(self):
    """Parse a record with UTF-8 (multi-byte) http headers, re-write it,
    and verify the serialized form and the adjusted record length."""
    # backslash-continuation style keeps the literal free of indentation
    UTF8_PAYLOAD = u'\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename="испытание.txt"\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text'

    # length in bytes, not characters (payload contains multi-byte chars)
    content_length = len(UTF8_PAYLOAD.encode('utf-8'))

    UTF8_RECORD = u'\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: {0}\r\n\
\r\n\
{1}\r\n\
\r\n\
'.format(content_length, UTF8_PAYLOAD)

    assert (content_length == 226)

    record = ArcWarcRecordLoader().parse_record_stream(
        BytesIO(UTF8_RECORD.encode('utf-8')))

    writer = BufferWARCWriter(gzip=False)
    writer.write_record(record)

    raw_buff = writer.get_contents()
    # writer normalizes the unicode headers; compare with known fixture
    assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS

    for record in ArchiveIterator(writer.get_stream()):
        assert record.length == 268
def _write_one(self):
    """Take one (request, response) pair off the write queue and write it.

    The response payload is parsed first; if it is a 'response' record,
    the matching request record is built from the buffered request payload
    and both are written as a pair. Other record types are written alone.
    Both payload buffers are closed in all cases.
    """
    req_pay = None
    resp_pay = None
    try:
        result = self.write_queue.get()

        req_head, req_pay, resp_head, resp_pay, params = result

        # payload buffers were written sequentially; tell() gives length,
        # then rewind before parsing/writing
        resp_length = resp_pay.tell()
        resp_pay.seek(0)

        resp = ArcWarcRecordLoader().parse_record_stream(resp_pay)

        if resp.rec_type == 'response':
            uri = resp.rec_headers.get_header('WARC-Target-Uri')

            req_length = req_pay.tell()
            req_pay.seek(0)

            req = self.writer.create_warc_record(uri=uri,
                                                 record_type='request',
                                                 payload=req_pay,
                                                 length=req_length,
                                                 warc_headers_dict=req_head)

            self.writer.write_request_response_pair(req, resp, params)

        else:
            self.writer.write_record(resp, params)

    finally:
        # best-effort cleanup; never let a close failure kill the writer loop
        try:
            if req_pay:
                no_except_close(req_pay)

            if resp_pay:
                no_except_close(resp_pay)
        except Exception as e:
            traceback.print_exc()
class DirectUpstream(object):
    """mitmproxy addon that resolves each proxied request to an upstream
    (warcserver) URL, optionally rewrites the returned archived content,
    and serves special pages on the magic proxy hostname."""

    def __init__(self, upstream_url_resolver, proxy_magic='pywb.proxy',
                 magic_fwd='http://localhost/', assets_path=None, is_rw=True):
        """Initialize resolver, forward target, template views and rewriter.

        :param upstream_url_resolver: callable resolving a request URL to an
            upstream URL plus extra data
        :param str proxy_magic: magic hostname addressing the proxy itself
        :param str magic_fwd: URL that magic-host requests are forwarded to
        :param assets_path: optional assets path for the Jinja environment
        :param bool is_rw: if False, disable content rewriting (pass-through)
        """
        self.upstream_url_resolver = upstream_url_resolver
        self.loader = ArcWarcRecordLoader()
        self.proxy_magic = proxy_magic

        # parse() yields bytes components; decode all but the port
        self.fwd_scheme, self.fwd_host, self.fwd_port, self.fwd_path = parse(
            magic_fwd)
        self.fwd_scheme = self.fwd_scheme.decode('latin-1')
        self.fwd_host = self.fwd_host.decode('latin-1')
        self.fwd_path = self.fwd_path.decode('latin-1')

        self.jinja_env = JinjaEnv(assets_path=assets_path)

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               'head_insert.html',
                                               'banner.html')
        self.error_view = BaseInsertView(self.jinja_env, 'error.html')
        self.home_redir_view = BaseInsertView(self.jinja_env, 'home.html')

        if is_rw:
            self.content_rewriter = Rewriter(is_framed_replay=False)
        else:
            self.content_rewriter = None

    def request(self, flow):
        """mitmproxy hook: rewrite the outgoing request to the upstream."""
        self._set_request_url(flow)

    def _set_request_url(self, flow, postreq=''):
        """Resolve the request to its upstream URL and redirect the flow.

        Returns True when the flow was redirected upstream, False when it
        was handled specially (magic-host forward or homepage redirect).
        """
        host = flow.request.headers.get('host')
        if not host:
            host = flow.request.host

        homepage_redirect = None

        # requests addressed to the magic proxy host itself
        if (host == self.proxy_magic and
                (flow.request.path in (H_REFRESH_PATH, H_REDIR_PATH))):
            homepage_redirect = flow.request.path

        elif host == self.proxy_magic:
            # forward other magic-host requests to the configured target
            flow.request.host = self.fwd_host
            flow.request.scheme = self.fwd_scheme
            flow.request.port = self.fwd_port
            flow.request.headers['X-Proxy-For'] = str(
                flow.client_conn.address.host)
            return False

        if host:
            host = flow.request.scheme + '://' + host
        else:
            host = hostport(flow.request.scheme, flow.request.host,
                            flow.request.port)

        req_url = host + flow.request.path

        # remember the original request url/scheme for later processing
        flow.request.req_url = req_url
        flow.request.req_scheme = flow.request.scheme

        result = self.upstream_url_resolver(url=quote_plus(req_url),
                                            headers=flow.request.headers,
                                            address=flow.client_conn.address,
                                            postreq=postreq)

        full_url, extra_data = result

        if homepage_redirect:
            url = extra_data.get('url')
            if url:
                if homepage_redirect == H_REFRESH_PATH:
                    self.homepage_refresh(flow, url)
                elif homepage_redirect == H_REDIR_PATH:
                    self.homepage_redir(flow, url)
            return False

        # point the flow at the resolved upstream URL
        scheme, host, port, path = parse(full_url)
        flow.request.scheme = scheme
        flow.request.host = host
        flow.request.port = port
        flow.request.path = path

        flow.extra_data = extra_data
        return True

    def responseheaders(self, flow):
        """mitmproxy hook: stream successful upstream responses."""
        if flow.request.host == self.fwd_host:
            return

        if hasattr(flow, 'direct_response'):
            return

        if flow.response.status_code == 200:
            flow.response.stream = True

    def response(self, flow):
        """mitmproxy hook: rewrite the upstream response or render an error."""
        if flow.request.host == self.fwd_host:
            return

        if hasattr(flow, 'direct_response'):
            return

        if flow.response.status_code != 200:
            url = flow.request.req_url
            err_status = 400
            err_msg = 'Proxy Error'
            if flow.response.status_code == 404:
                err_status = 404
                err_msg = 'Not Found'

            self.send_error(flow, url, err_status, err_msg)
            return

        an_iter = flow.live.read_response_body(flow.request, flow.response)
        stream = IterIO(an_iter)

        try:
            self._set_response(flow, stream)
        except Exception as e:
            # log and continue; a broken rewrite must not kill the proxy
            if hasattr(flow.request, 'req_url'):
                print(flow.request.req_url)
            print(type(e), e)
            import traceback
            traceback.print_exc()

    def homepage_redir(self, flow, redir_url):
        """Issue a 303 redirect to *redir_url* for a magic-host request."""
        flow.request.host = self.fwd_host
        flow.response = HTTPResponse.make(303, b'',
                                          {'Location': redir_url})
        return True

    def homepage_refresh(self, flow, url):
        """Render the homepage-refresh template directly for this flow."""
        flow.direct_response = True
        environ = {}
        environ['webrec.template_params'] = {'url': url}
        resp_data = self.home_redir_view.render_to_string(environ).encode(
            'utf-8')
        flow.response = HTTPResponse.make(
            200, resp_data, {'Content-Type': 'text/html; charset=utf-8'})
        return True

    def send_error(self, flow, url, status, reason):
        """Replace the flow's response with a rendered error page."""
        template_params = {}
        if hasattr(flow, 'extra_data') and flow.extra_data:
            template_params = flow.extra_data

        template_params['url'] = url
        template_params['cdx'] = {'url': url}
        template_params['proxy_magic'] = self.proxy_magic
        host_prefix = flow.request.req_scheme + '://' + self.proxy_magic
        template_params['wbrequest'] = {'host_prefix': host_prefix}

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        msg = self.error_view.render_to_string(environ).encode('utf-8')

        flow.response.content = msg
        flow.response.status_code = status
        flow.response.reason = reason
        flow.response.headers = Headers()
        flow.response.headers['Content-Type'] = 'text/html; charset=utf-8'
        flow.response.headers['Content-Length'] = str(len(msg))

    def process_record(self, record, flow):
        """Rewrite an archived record's content for replay.

        :return: (status_headers, stream/iter) for the client response
        """
        headers = flow.response.headers
        url = flow.request.req_url
        scheme = flow.request.req_scheme

        # pass-through mode: no rewriting configured
        if not self.content_rewriter:
            return record.http_headers, StreamIO(record.raw_stream)

        cookie_rewriter = None

        template_params = flow.extra_data

        environ = {
            'pywb_proxy_magic': self.proxy_magic,
            'webrec.template_params': template_params
        }

        wb_url = WbUrl(url)
        wb_prefix = ''
        host_prefix = flow.request.req_scheme + '://' + self.proxy_magic

        urlrewriter = SchemeOnlyUrlRewriter(wb_url, '')

        # ajax requests must not receive the banner/head insert
        if flow.request.headers.get('X-Requested-With',
                                    '').lower() == 'xmlhttprequest':
            urlrewriter.rewrite_opts['is_ajax'] = True

        head_insert_func = (self.head_insert_view.create_insert_func(
            wb_url, wb_prefix, host_prefix, url, environ, False))

        urlkey = canonicalize(wb_url.url)

        # build a minimal cdx object for the rewriter
        cdx = CDXObject()
        cdx['urlkey'] = urlkey
        cdx['timestamp'] = http_date_to_timestamp(
            headers.get('Memento-Datetime'))
        cdx['url'] = wb_url.url

        if headers.get('Webagg-Source-Coll') == 'live':
            cdx['is_live'] = 'true'

        result = self.content_rewriter.rewrite_content(
            urlrewriter, record.http_headers, record.raw_stream,
            head_insert_func, urlkey, cdx, cookie_rewriter, environ)

        status_headers, gen, is_rw = result

        status_headers.remove_header('Content-Security-Policy')

        # check for content-length
        res = status_headers.get_header('content-length')
        try:
            if int(res) > 0:
                return status_headers, IterIdent(gen)
        except:
            pass

        # need to either chunk or buffer to get content-length
        if flow.request.http_version == 'HTTP/1.1':
            status_headers.remove_header('content-length')
            status_headers.headers.append(('Transfer-Encoding', 'chunked'))
            #gen = chunk_encode_iter(gen)
        else:
            gen = buffer_iter(status_headers, gen)

        return status_headers, IterIdent(gen)

    def _set_response(self, flow, stream):
        """Parse the upstream record and install the rewritten response."""
        record = self.loader.parse_record_stream(stream)

        status_headers, gen = self.process_record(record, flow)

        if status_headers:
            headers_bytes = [(n.encode('iso-8859-1'), v.encode('iso-8859-1'))
                             for n, v in status_headers.headers]

            flow.response.headers = Headers(headers_bytes)

            protocol = status_headers.protocol

            status, reason = status_headers.statusline.split(' ', 1)
            flow.response.status_code = int(status)
            flow.response.reason = reason

        flow.response.stream = gen

    def serverconnect(self, server_conn):
        """mitmproxy hook: no-op."""
        return

    def error(self, flow):
        """mitmproxy hook: log the URL of a failed flow."""
        if hasattr(flow.request, 'req_url'):
            url = flow.request.req_url
        else:
            url = ''
        print('ERROR', url)
class RewriterApp(object):
    """Primary application for rewriting the content served by pywb (if it is to be rewritten).

    This class is also responsible rendering the archives templates
    """

    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"

    def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
        """Initialize a new instance of RewriterApp

        :param bool framed_replay: Is rewriting happening in framed replay mode
        :param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for rendering static files
        :param dict|None config: Optional config dictionary
        :param dict|None paths: Optional dictionary containing a mapping of path names to URLs
        """
        self.loader = ArcWarcRecordLoader()

        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        self.enable_prefer = self.config.get('enable_prefer', False)

        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'},
                                 extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
            jinja_env.jinja_env.install_null_translations()

        self.jinja_env = jinja_env

        self.loc_map = {}

        self.jinja_env.init_loc(self.config.get('locales_root_dir'),
                                self.config.get('locales'),
                                self.loc_map,
                                self.config.get('default_locale'))

        # NOTE(review): uses the raw `config` argument (may be None) rather
        # than self.config — raises AttributeError when no config is passed
        self.redirect_to_exact = config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               self._html_templ('head_insert_html'),
                                               self.banner_view)

        self.frame_insert_view = TopFrameView(self.jinja_env,
                                              self._html_templ('frame_insert_html'),
                                              self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))

        self.use_js_obj_proxy = config.get('use_js_obj_proxy', True)

        self.cookie_tracker = self._init_cookie_tracker()

        self.enable_memento = self.config.get('enable_memento')

        self.static_prefix = self.config.get('static_prefix', 'static')

        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

        # deprecated: Use X-Forwarded-Proto header instead!
        self.force_scheme = config.get('force_scheme')

    def _init_cookie_tracker(self, redis=None):
        """Initialize the CookieTracker

        :param redis: Optional redis instance to be used
        Defaults to FakeStrictRedis
        :return: The initialized cookie tracker
        :rtype: CookieTracker
        """
        if redis is None:
            redis = FakeStrictRedis()
        return CookieTracker(redis)

    def add_csp_header(self, wb_url, status_headers):
        """Adds Content-Security-Policy headers to the supplied StatusAndHeaders
        instance if the wb_url's mod is equal to the replay mod

        :param WbUrl wb_url: The WbUrl for the URL being operated on
        :param warcio.StatusAndHeaders status_headers: The status and headers
        instance for the reply to the URL
        """
        if self.csp_header and wb_url.mod == self.replay_mod:
            status_headers.headers.append(self.csp_header)

    def _html_templ(self, name):
        """Returns the html file name for the supplied html template name.

        :param str name: The name of the html template
        :return: The file name for the template
        :rtype: str|None
        """
        value = self.config.get(name)
        if not value:
            value = name.replace('_html', '.html')
        return value

    def is_framed_replay(self, wb_url):
        """Returns T/F indicating if the rewriter app is configured to be
        operating in framed replay mode and the supplied WbUrl is also
        operating in framed replay mode

        :param WbUrl wb_url: The WbUrl instance to check
        :return: T/F if in framed replay mode
        :rtype: bool
        """
        return (self.framed_replay and
                wb_url.mod == self.frame_mod and
                wb_url.is_replay())

    def _check_accept_dt(self, wb_url, environ):
        """Returns T/F indicating if the supplied WbUrl instance is for a
        timegate request

        :param WbUrl wb_url: The URL to be checked
        :param dict environ: The wsgi environment object for the request
        :return: T/F indicating if the WbUrl is for timegate request
        :rtype: bool
        """
        is_timegate = False
        if wb_url.is_latest_replay():
            accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
            is_timegate = True
            if accept_dt:
                try:
                    wb_url.timestamp = http_date_to_timestamp(accept_dt)
                except Exception:
                    raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
                    # return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')

                wb_url.type = wb_url.REPLAY

        elif 'pywb_proxy_default_timestamp' in environ:
            wb_url.timestamp = environ['pywb_proxy_default_timestamp']
            wb_url.type = wb_url.REPLAY

        return is_timegate

    def _get_prefer_mod(self, wb_url, environ, content_rw, is_proxy):
        """Returns the default rewrite modifier and rewrite modifier based on
        the value of the Prefer HTTP header if it is present

        :param WbUrl wb_url: The WbUrl for the URL being rewritten
        :param dict environ: The WSGI environment dictionary for the request
        :param content_rw: The content rewriter instance
        :param bool is_proxy: Is the rewrite operating in proxy mode
        :return: A tuple containing the default rewrite modifier and rewrite
            modifier based on the value of the Prefer HTTP header if it is present
        :rtype: tuple[str|None, str|None]
        """
        if not self.enable_prefer:
            return None, None

        prefer = environ.get('HTTP_PREFER')
        if not prefer:
            return None, content_rw.mod_to_prefer(wb_url.mod)

        mod = content_rw.prefer_to_mod(prefer)

        if mod is None:
            raise UpstreamException(400, url=wb_url.url, details='Invalid Prefer: ' + prefer)

        # proxy mode can't do full replay rewriting; degrade to banner-only
        if is_proxy and mod == self.replay_mod:
            mod = 'bn_'
            prefer = content_rw.mod_to_prefer('bn_')

        return mod, prefer

    def _check_range(self, inputreq, wb_url):
        """Checks the input request if it is a range request returning the
        start and end of the range as well as T/F if the request should be
        skipped as a tuple.

        :param RewriteInputRequest inputreq: The input request to check range
        :param WbUrl wb_url: The WbUrl associated with the request
        :return: A tuple with the start, end, and T/F should skip request
        :rtype: tuple[int|None, int|None, bool]
        """
        skip_record = False
        range_start = None
        range_end = None

        rangeres = inputreq.extract_range()
        if not rangeres:
            return range_start, range_end, skip_record

        mod_url, start, end, use_206 = rangeres

        # remove the range and still proxy
        if not use_206:
            return range_start, range_end, skip_record

        wb_url.url = mod_url
        inputreq.url = mod_url

        range_start = start
        range_end = end

        # if start with 0, load from upstream, but add range after
        if start == 0:
            del inputreq.env['HTTP_RANGE']
        else:
            skip_record = True

        return range_start, range_end, skip_record

    def _add_range(self, record, wb_url, range_start, range_end):
        # Convert a full 200 response into a 206 partial response for the
        # requested range; returns True when the range was applied.
        if range_end is None and range_start is None:
            return

        if record.http_headers.get_statuscode() != '200':
            return

        content_length = (record.http_headers.get_header('Content-Length'))
        if content_length is None:
            return

        # some archived responses carry duplicated comma-joined lengths
        content_length = content_length.split(',')[0]

        try:
            content_length = int(content_length)
            if not range_end:
                range_end = content_length - 1

            if range_start >= content_length or range_end >= content_length:
                details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start,
                                                                           range_end,
                                                                           content_length)
                raise UpstreamException(416, url=wb_url.url, details=details)

            range_len = range_end - range_start + 1
            record.http_headers.add_range(range_start, range_len, content_length)

            record.http_headers.replace_header('Content-Length', str(range_len))

            record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len)
            return True

        except (ValueError, TypeError):
            pass

    def send_redirect(self, new_path, url_parts, urlrewriter):
        # 307-redirect to the same url with the path replaced by new_path
        scheme, netloc, path, query, frag = url_parts
        path = new_path
        url = urlunsplit((scheme, netloc, path, query, frag))
        resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                         '307 Temporary Redirect')

        if self.enable_memento:
            resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

        return resp

    def prepare_env(self, environ):
        """ setup environ path prefixes and scheme """
        if 'pywb.host_prefix' in environ:
            return

        proto = environ.get('HTTP_X_FORWARDED_PROTO', self.force_scheme)

        if proto:
            environ['wsgi.url_scheme'] = proto

        environ['pywb.host_prefix'] = self.get_host_prefix(environ)
        environ['pywb.app_prefix'] = environ.get('SCRIPT_NAME', '')
        environ['pywb.static_prefix'] = environ['pywb.host_prefix'] + environ['pywb.app_prefix'] + '/' + self.static_prefix

    def render_content(self, wb_url, kwargs, environ):
        # Main replay entry point: resolve the requested WbUrl, fetch the
        # record from the upstream warcserver, rewrite it, and build the
        # final WbResponse (or a redirect/timegate/query response).
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        history_page = environ.pop('HTTP_X_WOMBAT_HISTORY_PAGE', '')
        if history_page:
            wb_url.url = history_page
            is_ajax = True
        else:
            is_ajax = self.is_ajax(environ)

        is_timegate = self._check_accept_dt(wb_url, environ)

        self.prepare_env(environ)

        host_prefix = environ['pywb.host_prefix']
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        pywb_static_prefix = environ['pywb.static_prefix'] + '/'
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # if OPTIONS in proxy mode, just generate the proxy responss
        if is_proxy and self.is_preflight(environ):
            return WbResponse.options_response(environ)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        # no redirects if in proxy
        redirect_to_exact = self.redirect_to_exact and not is_proxy

        # Check Prefer
        pref_mod, pref_applied = self._get_prefer_mod(wb_url, environ,
                                                      content_rw, is_proxy)

        response = None
        keep_frame_response = False

        # prefer overrides custom response?
        if pref_mod is not None:
            # fast-redirect to preferred
            if redirect_to_exact and not is_timegate and pref_mod != wb_url.mod:
                new_url = full_prefix + wb_url.to_str(mod=pref_mod)
                headers = [('Preference-Applied', pref_applied),
                           ('Vary', 'Prefer')]

                return WbResponse.redir_response(new_url,
                                                 '307 Temporary Redirect',
                                                 headers=headers)
            else:
                wb_url.mod = pref_mod
        else:
            if kwargs.get('output'):
                response = self.handle_timemap(wb_url, kwargs, full_prefix)

            elif wb_url.is_query():
                response = self.handle_query(environ, wb_url, kwargs, full_prefix)

            else:
                response = self.handle_custom_response(environ, wb_url,
                                                       full_prefix, host_prefix,
                                                       kwargs)

            keep_frame_response = (not kwargs.get('no_timegate_check') and is_timegate and not is_proxy) or redirect_to_exact

        if response and not keep_frame_response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix,
                                      pywb_static_prefix=pywb_static_prefix)

            framed_replay = self.framed_replay

        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            return self.send_redirect('/', url_parts, urlrewriter)

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        cookie_key = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            if cookie_key:
                res = self.cookie_tracker.get_cookie_headers(wb_url.url,
                                                             urlrewriter,
                                                             cookie_key,
                                                             environ.get('HTTP_COOKIE', ''))
                inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            error = None
            try:
                error = r.raw.read()
            except Exception:
                pass
            finally:
                no_except_close(r.raw)

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            if r.status_code == 404:
                raise NotFoundException(url=wb_url.url, msg=details)

            else:
                raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        cdx_url_parts = urlsplit(cdx['url'])

        if cdx_url_parts.path.endswith('/') and not url_parts.path.endswith('/'):
            # add trailing slash
            new_path = url_parts.path + '/'

            no_except_close(r.raw)

            return self.send_redirect(new_path, url_parts, urlrewriter)

        # only redirect to exact if not live, otherwise set to false
        redirect_to_exact = redirect_to_exact and not cdx.get('is_live')

        # return top-frame timegate response, with timestamp from cdx
        if response and keep_frame_response and (not redirect_to_exact or not is_timegate):
            no_except_close(r.raw)
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy, cdx['timestamp'])

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        # cdx['urlkey'] = urlkey
        # cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        # cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redirect to exact timestamp (only set if not live)
        if redirect_to_exact:
            if set_content_loc or is_timegate or wb_url.timestamp != cdx.get('timestamp'):
                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy,
                                                pref_applied=pref_applied,
                                                mod=pref_mod, is_memento=False)

                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs, record)

        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.create_insert_func(
                wb_url, full_prefix, host_prefix,
                top_url, environ, framed_replay,
                coll=kwargs.get('coll', ''),
                replay_mod=self.replay_mod,
                metadata=kwargs.get('metadata', {}),
                config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker and cookie_key:
            # skip add cookie if service worker is not 200
            # it seems cookie headers from service workers are not applied, so don't update in cache
            if wb_url.mod == 'sw_':
                cookie_key = None
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx, environ)

        status_headers, gen, is_rw = result

        if history_page:
            title = DefaultRewriter._extract_title(gen)
            if not title:
                title = unquote(environ.get('HTTP_X_WOMBAT_HISTORY_TITLE', ''))

            if not title:
                title = history_page

            self._add_history_page(cdx, kwargs, title)
            return WbResponse.json_response({'title': title})

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # ensure the status line has a reason phrase
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'],
                                    status_headers, is_timegate, is_proxy,
                                    cdx.get('source-coll'),
                                    mod=pref_mod, pref_applied=pref_applied)

            set_content_loc = True

        if set_content_loc and not redirect_to_exact and not is_proxy:
            status_headers.headers.append(('Content-Location',
                                           urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                   url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        if is_proxy and environ.get('HTTP_ORIGIN'):
            response.add_access_control_headers(environ)

        if r.status_code == 200 and kwargs.get('cache') == 'always' and environ.get('HTTP_REFERER'):
            response.status_headers['Cache-Control'] = 'public, max-age=31536000, immutable'

        return response

    def format_response(self, response, wb_url, full_prefix, is_timegate,
                        is_proxy, timegate_closest_ts=None):
        # Wrap a non-WbResponse (rendered template string) as text/html and
        # attach memento link headers when enabled.
        memento_ts = None
        if not isinstance(response, WbResponse):
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'
            else:
                memento_ts = timegate_closest_ts or wb_url.timestamp

            response = WbResponse.text_response(response, content_type=content_type)

        if self.enable_memento and response.status_headers.statusline.startswith('200'):
            self._add_memento_links(wb_url.url, full_prefix,
                                    None, memento_ts,
                                    response.status_headers,
                                    is_timegate,
                                    is_proxy,
                                    is_memento=not is_timegate)

        return response

    def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                           status_headers, is_timegate, is_proxy, coll=None,
                           pref_applied=None, mod=None, is_memento=True):
        """Adds the memento link headers to supplied StatusAndHeaders instance

        :param str url: The URI-R being rewritten
        :param str full_prefix: The replay prefix
        :param str|None memento_dt: The memento datetime for the URI-R being rewritten
        :param str memento_ts: The memento timestamp
        :param warcio.StatusAndHeaders status_headers:
        :param bool is_timegate: Are we returning a response for a timegate
        :param bool is_proxy: Are we operating in proxy mode
        :param str|None coll: The collection the URI-R is from
        :param str|None pref_applied:
        :param str|None mod: The rewrite modifier
        :param bool is_memento:
        :rtype: None
        """
        replay_mod = mod or self.replay_mod

        # memento url + header
        if not memento_dt and memento_ts:
            memento_dt = timestamp_to_http_date(memento_ts)

        if memento_dt:
            if is_memento:
                status_headers.headers.append(('Memento-Datetime', memento_dt))

            if is_proxy:
                memento_url = url
            else:
                memento_url = full_prefix + memento_ts + replay_mod
                memento_url += '/' + url
        else:
            memento_url = None

        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix, mod)

        link = []
        if not is_proxy:
            link.append(MementoUtils.make_link(url, 'original'))
            link.append(MementoUtils.make_link(timegate_url, 'timegate'))
            link.append(MementoUtils.make_link(timemap_url, 'timemap'))

        if memento_dt:
            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))

        link_str = ', '.join(link)

        status_headers.headers.append(('Link', link_str))

        vary = ''
        if is_timegate:
            vary = 'accept-datetime'

        if pref_applied:
            vary = 'Prefer' if not vary else vary + ', Prefer'
            status_headers.headers.append(('Preference-Applied', pref_applied))

        if vary:
            status_headers.headers.append(('Vary', vary))

    def _get_timegate_timemap(self, url, full_prefix, mod):
        # timegate url
        timegate_url = full_prefix
        # NOTE(review): `mod` is unconditionally cleared here, making the
        # following branch dead code — the mod parameter is ignored; confirm
        # whether this is intentional
        mod = ''
        if mod:
            timegate_url += mod + '/'

        timegate_url += url

        # timemap url
        timemap_url = full_prefix + 'timemap/link/' + url
        return timegate_url, timemap_url

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        # top frame url: same url with the mod stripped
        top_url = full_prefix + wb_url.to_str(mod='')
        return top_url

    def handle_error(self, environ, wbe):
        # dispatch to 404 or generic error rendering
        if isinstance(wbe, NotFoundException):
            return self._not_found_response(environ, wbe.url)
        else:
            return self._error_response(environ, wbe)

    def _not_found_response(self, environ, url):
        resp = self.not_found_view.render_to_string(environ, url=url, err_msg="Not Found")
        return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html')

    def _error_response(self, environ, wbe):
        status = wbe.status()
        resp = self.error_view.render_to_string(environ,
                                                err_msg=wbe.url,
                                                err_details=wbe.msg,
                                                err_status=wbe.status_code)
        return WbResponse.text_response(resp, status=status, content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip_record):
        # POST the reconstructed client request to the upstream warcserver
        # and return the streaming response
        req_data = inputreq.reconstruct_request(wb_url.url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        headers.update(inputreq.warcserver_headers)

        if skip_record:
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        params = {'url': wb_url.url,
                  'closest': closest,
                  'matchType': 'exact'}

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def do_query(self, wb_url, kwargs):
        """Performs the timemap query request for the supplied WbUrl
        returning the response

        :param WbUrl wb_url: The WbUrl to be queried
        :param dict kwargs: Optional keyword arguments
        :return: The queries response
        :rtype: requests.Response
        """
        params = {'url': wb_url.url,
                  'output': kwargs.get('output', 'json'),
                  'from': wb_url.timestamp,
                  'to': wb_url.end_timestamp}

        if 'memento_format' in kwargs:
            params['memento_format'] = kwargs['memento_format']

        if 'limit' in kwargs:
            params['limit'] = kwargs['limit']

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        # query the index endpoint instead of the resource endpoint
        upstream_url = upstream_url.replace('/resource/postreq', '/index')

        r = requests.get(upstream_url)

        return r

    def make_timemap(self, wb_url, res, full_prefix, output):
        wb_url.type = wb_url.QUERY

        content_type = res.headers.get('Content-Type')
        text = res.text

        # NOTE(review): `status` is unbound if res.text is non-empty and
        # res.status_code is falsy — presumably status_code is always set
        # on a requests.Response; verify
        if not res.text:
            status = '404 Not Found'

        elif res.status_code:
            status = str(res.status_code) + ' ' + res.reason

            if res.status_code == 200 and output == 'link':
                timegate, timemap = self._get_timegate_timemap(wb_url.url,
                                                               full_prefix,
                                                               wb_url.mod)

                text = MementoUtils.wrap_timemap_header(wb_url.url,
                                                        timegate,
                                                        timemap,
                                                        res.text)

        return WbResponse.text_response(text,
                                        content_type=content_type,
                                        status=status)

    def handle_timemap(self, wb_url, kwargs, full_prefix):
        output = kwargs.get('output')
        kwargs['memento_format'] = full_prefix + '{timestamp}' + self.replay_mod + '/{url}'
        res = self.do_query(wb_url, kwargs)
        return self.make_timemap(wb_url, res, full_prefix, output)

    def handle_query(self, environ, wb_url, kwargs, full_prefix):
        prefix = self.get_full_prefix(environ)

        params = dict(url=wb_url.url,
                      prefix=prefix)

        return self.query_view.render_to_string(environ, **params)

    def get_host_prefix(self, environ):
        scheme = environ['wsgi.url_scheme'] + '://'

        # proxy
        host = environ.get('wsgiprox.proxy_host')
        if host:
            return scheme + host

        # default
        host = environ.get('HTTP_HOST')
        if host:
            return scheme + host

        # if no host
        host = environ['SERVER_NAME']
        if environ['wsgi.url_scheme'] == 'https':
            if environ['SERVER_PORT'] != '443':
                host += ':' + environ['SERVER_PORT']
        else:
            if environ['SERVER_PORT'] != '80':
                host += ':' + environ['SERVER_PORT']

        return scheme + host

    def get_rel_prefix(self, environ):
        # return request.script_name
        return environ.get('SCRIPT_NAME') + '/'

    def get_full_prefix(self, environ):
        return self.get_host_prefix(environ) + self.get_rel_prefix(environ)

    def unrewrite_referrer(self, environ, full_prefix):
        # Strip the replay prefix from the Referer header so the upstream
        # sees the original url; returns True when the header was rewritten.
        referrer = environ.get('HTTP_REFERER')
        if not referrer:
            return False

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            if referrer:
                environ['HTTP_REFERER'] = WbUrl(referrer).url
                return True

        return False

    def is_ajax(self, environ):
        value = environ.get('HTTP_X_REQUESTED_WITH')
        value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        # additional checks for proxy mode only
        if not ('wsgiprox.proxy_host' in environ):
            return False

        # if Chrome Sec-Fetch-Mode is set and is set to 'cors', then
        # a fetch / ajax request
        sec_fetch_mode = environ.get('HTTP_SEC_FETCH_MODE')
        if sec_fetch_mode and sec_fetch_mode == 'cors':
            return True

        return False

    def is_preflight(self, environ):
        # CORS preflight: OPTIONS + Origin + at least one
        # Access-Control-Request-* header
        if environ.get('REQUEST_METHOD') != 'OPTIONS':
            return False

        if not environ.get('HTTP_ORIGIN'):
            return False

        if not environ.get('HTTP_ACCESS_CONTROL_REQUEST_METHOD') and not environ.get('HTTP_ACCESS_CONTROL_REQUEST_HEADERS'):
            return False

        return True

    def get_base_url(self, wb_url, kwargs):
        type_ = kwargs.get('type')
        return self.paths[type_].format(**kwargs)

    def get_upstream_url(self, wb_url, kwargs, params):
        base_url = self.get_base_url(wb_url, kwargs)
        param_str = urlencode(params, True)
        if param_str:
            q_char = '&' if '?' in base_url else '?'
            base_url += q_char + param_str
        return base_url

    def get_cookie_key(self, kwargs):
        # note: currently this is per-collection, so enabled only for live or recording
        # to support multiple users recording/live, would need per user cookie
        if kwargs.get('index') == '$live' or kwargs.get('type') == 'record':
            return 'cookie:' + kwargs['coll']
        else:
            return None

    def _add_history_page(self, cdx, kwargs, doc_title):
        # hook for subclasses; no-op by default
        pass

    def _add_custom_params(self, cdx, headers, kwargs, record):
        # hook for subclasses; no-op by default
        pass

    def get_top_frame_params(self, wb_url, kwargs):
        return {'metadata': kwargs.get('metadata', {})}

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
        # In framed replay mode, serve the top frame shell; otherwise None
        # so the caller proceeds with normal content replay.
        if self.is_framed_replay(wb_url):
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        environ,
                                                        self.frame_mod,
                                                        self.replay_mod,
                                                        coll='',
                                                        extra_params=extra_params)

        return None
def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
    """Initialize a new instance of RewriterApp

    :param bool framed_replay: Is rewriting happening in framed replay mode
    :param JinjaEnv|None jinja_env: Optional JinjaEnv instance to be used for rendering static files
    :param dict|None config: Optional config dictionary
    :param dict|None paths: Optional dictionary containing a mapping of path names to URLs
    """
    self.loader = ArcWarcRecordLoader()

    self.config = config or {}
    self.paths = paths or {}

    self.framed_replay = framed_replay

    if framed_replay:
        self.frame_mod = ''
        self.replay_mod = 'mp_'
    else:
        self.frame_mod = None
        self.replay_mod = ''

    self.enable_prefer = self.config.get('enable_prefer', False)

    self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                      config=config)

    self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

    if not jinja_env:
        jinja_env = JinjaEnv(globals={'static_path': 'static'},
                             extensions=['jinja2.ext.i18n', 'jinja2.ext.with_'])
        jinja_env.jinja_env.install_null_translations()

    self.jinja_env = jinja_env

    self.loc_map = {}

    self.jinja_env.init_loc(self.config.get('locales_root_dir'),
                            self.config.get('locales'),
                            self.loc_map,
                            self.config.get('default_locale'))

    # Fix: read options via self.config (always a dict) rather than the raw
    # `config` argument, which defaults to None and previously raised
    # AttributeError when RewriterApp was constructed without a config.
    self.redirect_to_exact = self.config.get('redirect_to_exact')

    self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))

    self.head_insert_view = HeadInsertView(self.jinja_env,
                                           self._html_templ('head_insert_html'),
                                           self.banner_view)

    self.frame_insert_view = TopFrameView(self.jinja_env,
                                          self._html_templ('frame_insert_html'),
                                          self.banner_view)

    self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
    self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
    self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))

    # was: config.get(...) — crashed when config is None
    self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

    self.cookie_tracker = self._init_cookie_tracker()

    self.enable_memento = self.config.get('enable_memento')

    self.static_prefix = self.config.get('static_prefix', 'static')

    csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
    if csp_header:
        self.csp_header = ('Content-Security-Policy', csp_header)
    else:
        self.csp_header = None

    # deprecated: Use X-Forwarded-Proto header instead!
    # was: config.get(...) — crashed when config is None
    self.force_scheme = self.config.get('force_scheme')
def parse(row):
    """Parse a single serialized WARC record.

    Fix: warcio's record loader consumes a *binary* stream; the previous
    implementation wrapped the input in StringIO, which fails on Python 3
    when the loader reads/decodes header bytes. The input is now fed through
    BytesIO, with text input encoded to bytes first.

    :param row: raw WARC record data (bytes, or text holding latin-1 safe data)
    :return: dict with the parsed record under 'warc' and the raw input,
        stringified as before, under 'raw'
    :rtype: dict
    """
    from io import BytesIO

    # accept legacy str input by encoding it byte-for-byte
    data = row.encode('latin-1') if isinstance(row, str) else row

    loader = ArcWarcRecordLoader()
    record = loader.parse_record_stream(BytesIO(data), known_format="warc")

    # preserve original behavior: raw value is str(row), not the bytes
    return {"warc": record, "raw": str(row)}
def parse_stream_error(**params):
    """Best-effort record parse: forward **params to a fresh
    ArcWarcRecordLoader's parse_record_stream and return the record.

    On any failure the exception class name is printed and None is
    returned implicitly (useful for doctest-style error inspection).
    """
    loader = ArcWarcRecordLoader()
    try:
        return loader.parse_record_stream(**params)
    except Exception as err:
        print('Exception: ' + err.__class__.__name__)
class ArchiveIterator(six.Iterator):
    """ Iterate over records in WARC and ARC files, both gzip chunk
    compressed and uncompressed

    The indexer will automatically detect format, and decompress
    if necessary.
    """

    GZIP_ERR_MSG = """\
    ERROR: non-chunked gzip file detected, gzip block continues
    beyond single record.

    This file is probably not a multi-member gzip but a single gzip file.

    To allow seek, a gzipped {1} must have each record compressed into
    a single gzip member and concatenated together.

    This file is likely still valid and can be fixed by running:

    warcio recompress <path/to/file> <path/to/new_file>
"""

    INC_RECORD = """\
    WARNING: Record not followed by newline, perhaps Content-Length is invalid
    Offset: {0}
    Remainder: {1}
"""

    def __init__(self, fileobj, no_record_parse=False,
                 verify_http=False, arc2warc=False,
                 ensure_http_headers=False, block_size=BUFF_SIZE):
        """
        :param fileobj: file-like object containing the WARC/ARC data
        :param bool no_record_parse: skip parsing of http headers within records
        :param bool verify_http: pass-through to ArcWarcRecordLoader
        :param bool arc2warc: treat ARC records as WARC (mixed formats allowed)
        :param bool ensure_http_headers: pass-through to record parsing
        :param int block_size: read block size for the decompressing reader
        """
        self.fh = fileobj

        self.loader = ArcWarcRecordLoader(verify_http=verify_http,
                                          arc2warc=arc2warc)
        self.known_format = None

        self.mixed_arc_warc = arc2warc

        # (offset, length) of the most recently consumed member, set by read_to_end
        self.member_info = None
        self.no_record_parse = no_record_parse
        self.ensure_http_headers = ensure_http_headers

        self.reader = DecompressingBufferedReader(self.fh,
                                                  block_size=block_size)
        self.offset = self.fh.tell()
        self.next_line = None

        self.err_count = 0

        self.record = None

        self.the_iter = self._iterate_records()

    def __iter__(self):
        return self.the_iter

    def __next__(self):
        return six.next(self.the_iter)

    def _iterate_records(self):
        """ iterate over each record
        """
        raise_invalid_gzip = False
        empty_record = False
        record = None

        while True:
            try:
                record = self._next_record(self.next_line)
                # raise deferred from the *previous* iteration, so the
                # fact that another record followed in the same gzip
                # member is confirmed before erroring
                if raise_invalid_gzip:
                    self._raise_invalid_gzip_err()

                yield record

            except EOFError:
                empty_record = True

            # consume any remainder of the record body before seeking on
            if record:
                self.read_to_end(record)

            if self.reader.decompressor:
                # if another gzip member, continue
                if self.reader.read_next_member():
                    continue

                # if empty record, then we're done
                elif empty_record:
                    break

                # otherwise, probably a gzip
                # containing multiple non-chunked records
                # raise this as an error
                else:
                    raise_invalid_gzip = True

            # non-gzip, so we're done
            elif empty_record:
                break

    def _raise_invalid_gzip_err(self):
        """ A gzip file with multiple ARC/WARC records, non-chunked
        has been detected. This is not valid for replay, so notify user
        """
        frmt = 'warc/arc'
        if self.known_format:
            frmt = self.known_format

        frmt_up = frmt.upper()

        msg = self.GZIP_ERR_MSG.format(frmt, frmt_up)
        raise ArchiveLoadFailed(msg)

    def _consume_blanklines(self):
        """ Consume blank lines that are between records
        - For warcs, there are usually 2
        - For arcs, may be 1 or 0
        - For block gzipped files, these are at end of each gzip envelope
          and are included in record length which is the full gzip envelope
        - For uncompressed, they are between records and so are NOT part of
          the record length

        count empty_size so that it can be substracted from
        the record length for uncompressed

        if first line read is not blank, likely error in WARC/ARC,
        display a warning
        """
        empty_size = 0
        first_line = True

        while True:
            line = self.reader.readline()
            if len(line) == 0:
                return None, empty_size

            stripped = line.rstrip()

            if len(stripped) == 0 or first_line:
                empty_size += len(line)

                if len(stripped) != 0:
                    # if first line is not blank,
                    # likely content-length was invalid, display warning
                    err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
                    sys.stderr.write(self.INC_RECORD.format(err_offset, line))
                    self.err_count += 1

                first_line = False
                continue

            return line, empty_size

    def read_to_end(self, record):
        """ Read remainder of the stream
        If a digester is included, update it
        with the data read
        """

        # already at end of this record, don't read until it is consumed
        if self.member_info:
            return None

        curr_offset = self.offset

        while True:
            b = record.raw_stream.read(BUFF_SIZE)
            if not b:
                break

        """
        - For compressed files, blank lines are consumed
          since they are part of record length
        - For uncompressed files, blank lines are read later,
          and not included in the record length
        """
        #if self.reader.decompressor:
        self.next_line, empty_size = self._consume_blanklines()

        # raw offset = file position minus what is still buffered unread
        self.offset = self.fh.tell() - self.reader.rem_length()
        #if self.offset < 0:
        #    raise Exception('Not Gzipped Properly')

        if self.next_line:
            self.offset -= len(self.next_line)

        length = self.offset - curr_offset

        if not self.reader.decompressor:
            length -= empty_size

        self.member_info = (curr_offset, length)
        #return self.member_info
        #return next_line

    def _next_record(self, next_line):
        """ Use loader to parse the record from the reader stream
        Supporting warc and arc records
        """
        record = self.loader.parse_record_stream(self.reader,
                                                 next_line,
                                                 self.known_format,
                                                 self.no_record_parse,
                                                 self.ensure_http_headers)

        self.member_info = None

        # Track known format for faster parsing of other records
        if not self.mixed_arc_warc:
            self.known_format = record.format

        return record
class RewriterApp(object):
    """Replay app that fetches archived records from an upstream
    warcserver endpoint, rewrites the content for replay (framed or
    frameless), and renders query/timemap/error views.

    Subclasses provide the cookie key via get_cookie_key() and may hook
    _add_custom_params() / get_top_frame_params().
    """

    VIDEO_INFO_CONTENT_TYPE = 'application/vnd.youtube-dl_formats+json'

    DEFAULT_CSP = "default-src 'unsafe-eval' 'unsafe-inline' 'self' data: blob: mediastream: ws: wss: ; form-action 'self'"

    def __init__(self, framed_replay=False, jinja_env=None, config=None, paths=None):
        """
        :param framed_replay: if True, replay content inside a top frame
        :param jinja_env: optional JinjaEnv; a default is created if None
        :param config: optional config dict (treated as {} if None)
        :param paths: optional dict mapping request 'type' to upstream url templates
        """
        self.loader = ArcWarcRecordLoader()

        self.config = config or {}
        self.paths = paths or {}

        self.framed_replay = framed_replay

        if framed_replay:
            self.frame_mod = ''
            self.replay_mod = 'mp_'
        else:
            self.frame_mod = None
            self.replay_mod = ''

        self.default_rw = DefaultRewriter(replay_mod=self.replay_mod,
                                          config=config)

        self.js_proxy_rw = RewriterWithJSProxy(replay_mod=self.replay_mod)

        if not jinja_env:
            jinja_env = JinjaEnv(globals={'static_path': 'static'})

        self.jinja_env = jinja_env

        # use self.config (never None) -- raw 'config' may be None
        self.redirect_to_exact = self.config.get('redirect_to_exact')

        self.banner_view = BaseInsertView(self.jinja_env, self._html_templ('banner_html'))

        self.head_insert_view = HeadInsertView(self.jinja_env,
                                               self._html_templ('head_insert_html'),
                                               self.banner_view)

        self.frame_insert_view = TopFrameView(self.jinja_env,
                                              self._html_templ('frame_insert_html'),
                                              self.banner_view)

        self.error_view = BaseInsertView(self.jinja_env, self._html_templ('error_html'))
        self.not_found_view = BaseInsertView(self.jinja_env, self._html_templ('not_found_html'))
        self.query_view = BaseInsertView(self.jinja_env, self._html_templ('query_html'))

        self.use_js_obj_proxy = self.config.get('use_js_obj_proxy', True)

        # set by subclass/integration if cookie rewriting is enabled
        self.cookie_tracker = None

        self.enable_memento = self.config.get('enable_memento')

        csp_header = self.config.get('csp-header', self.DEFAULT_CSP)
        if csp_header:
            self.csp_header = ('Content-Security-Policy', csp_header)
        else:
            self.csp_header = None

    def add_csp_header(self, wb_url, status_headers):
        """Append the configured CSP header, but only for actual replay
        (not banner-only/frame modifiers)."""
        if self.csp_header and wb_url.mod == self.replay_mod:
            status_headers.headers.append(self.csp_header)

    def _html_templ(self, name):
        """Resolve a template filename from config key *name*, defaulting
        to e.g. 'error_html' -> 'error.html'."""
        value = self.config.get(name)
        if not value:
            value = name.replace('_html', '.html')
        return value

    def is_framed_replay(self, wb_url):
        """True if this url should be rendered as the outer replay frame."""
        return (self.framed_replay and
                wb_url.mod == self.frame_mod and
                wb_url.is_replay())

    def _check_accept_dt(self, wb_url, environ):
        """Handle Memento timegate negotiation: if this is a latest-replay
        url, apply the Accept-Datetime header (if any) as the timestamp.

        Returns True if the request is a timegate request.
        Raises UpstreamException(400) on an unparseable Accept-Datetime.
        """
        is_timegate = False
        if wb_url.is_latest_replay():
            accept_dt = environ.get('HTTP_ACCEPT_DATETIME')
            is_timegate = True
            if accept_dt:
                try:
                    wb_url.timestamp = http_date_to_timestamp(accept_dt)
                except Exception:
                    raise UpstreamException(400, url=wb_url.url, details='Invalid Accept-Datetime')
                    #return WbResponse.text_response('Invalid Accept-Datetime', status='400 Bad Request')

                wb_url.type = wb_url.REPLAY

        return is_timegate

    def _check_range(self, inputreq, wb_url):
        """Extract a range-request modifier from the input request.

        Returns (range_start, range_end, skip_record):
        - (None, None, False) when no 206-range handling is needed
        - otherwise the byte range; skip_record is True when the range
          does not start at 0 (upstream should not record the fetch)
        """
        skip_record = False
        range_start = None
        range_end = None

        rangeres = inputreq.extract_range()

        if not rangeres:
            return range_start, range_end, skip_record

        mod_url, start, end, use_206 = rangeres

        # remove the range and still proxy
        if not use_206:
            return range_start, range_end, skip_record

        wb_url.url = mod_url
        inputreq.url = mod_url

        range_start = start
        range_end = end

        #if start with 0, load from upstream, but add range after
        if start == 0:
            del inputreq.env['HTTP_RANGE']
        else:
            skip_record = True

        return range_start, range_end, skip_record

    def _add_range(self, record, wb_url, range_start, range_end):
        """Convert a full 200 record into a 206 partial response for the
        requested byte range, wrapping the stream in an OffsetLimitReader.

        Returns True if range headers were applied, None otherwise.
        Raises UpstreamException(416) for an unsatisfiable range.
        """
        if range_end is None and range_start is None:
            return

        # only a full 200 response can be sliced into a 206
        if record.http_headers.get_statuscode() != '200':
            return

        content_length = (record.http_headers.
                          get_header('Content-Length'))

        try:
            content_length = int(content_length)
            if not range_end:
                range_end = content_length - 1

            if range_start >= content_length or range_end >= content_length:
                details = 'Invalid Range: {0} >= {2} or {1} >= {2}'.format(range_start, range_end, content_length)
                # close the record stream before aborting
                # (original referenced an undefined 'r' here)
                try:
                    record.raw_stream.close()
                except Exception:
                    pass
                raise UpstreamException(416, url=wb_url.url, details=details)

            range_len = range_end - range_start + 1
            record.http_headers.add_range(range_start, range_len,
                                          content_length)

            record.http_headers.replace_header('Content-Length', str(range_len))

            record.raw_stream = OffsetLimitReader(record.raw_stream, range_start, range_len)
            return True

        except (ValueError, TypeError):
            # missing/non-numeric Content-Length: serve unmodified
            pass

    def render_content(self, wb_url, kwargs, environ):
        """Main replay entry point: fetch the record for *wb_url* from
        upstream, rewrite it, and return a WbResponse.

        Raises UpstreamException on upstream errors (>= 400) and for
        invalid ranges/datetimes.
        """
        wb_url = wb_url.replace('#', '%23')
        wb_url = WbUrl(wb_url)

        is_timegate = self._check_accept_dt(wb_url, environ)

        host_prefix = self.get_host_prefix(environ)
        rel_prefix = self.get_rel_prefix(environ)
        full_prefix = host_prefix + rel_prefix
        is_proxy = ('wsgiprox.proxy_host' in environ)

        # timemap / query / top-frame requests are handled separately
        response = self.handle_custom_response(environ, wb_url,
                                               full_prefix, host_prefix,
                                               kwargs)
        if response:
            return self.format_response(response, wb_url, full_prefix, is_timegate, is_proxy)

        if is_proxy:
            environ['pywb_proxy_magic'] = environ['wsgiprox.proxy_host']
            urlrewriter = IdentityUrlRewriter(wb_url, '')
            framed_replay = False

        else:
            urlrewriter = UrlRewriter(wb_url,
                                      prefix=full_prefix,
                                      full_prefix=full_prefix,
                                      rel_prefix=rel_prefix)

            framed_replay = self.framed_replay

        # redirect bare host urls ('http://example.com') to add a '/' path
        url_parts = urlsplit(wb_url.url)
        if not url_parts.path:
            scheme, netloc, path, query, frag = url_parts
            path = '/'
            url = urlunsplit((scheme, netloc, path, query, frag))
            resp = WbResponse.redir_response(urlrewriter.rewrite(url),
                                             '307 Temporary Redirect')

            if self.enable_memento:
                resp.status_headers['Link'] = MementoUtils.make_link(url, 'original')

            return resp

        self.unrewrite_referrer(environ, full_prefix)

        urlkey = canonicalize(wb_url.url)

        if self.use_js_obj_proxy:
            content_rw = self.js_proxy_rw
        else:
            content_rw = self.default_rw

        inputreq = RewriteInputRequest(environ, urlkey, wb_url.url, content_rw)

        inputreq.include_method_query(wb_url.url)

        range_start, range_end, skip_record = self._check_range(inputreq, wb_url)

        setcookie_headers = None
        if self.cookie_tracker:
            cookie_key = self.get_cookie_key(kwargs)
            res = self.cookie_tracker.get_cookie_headers(wb_url.url, urlrewriter, cookie_key)
            inputreq.extra_cookie, setcookie_headers = res

        r = self._do_req(inputreq, wb_url, kwargs, skip_record)

        if r.status_code >= 400:
            # propagate upstream error body (best-effort read)
            error = None
            try:
                error = r.raw.read()
                r.raw.close()
            except Exception:
                pass

            if error:
                error = error.decode('utf-8')
            else:
                error = ''

            details = dict(args=kwargs, error=error)
            raise UpstreamException(r.status_code, url=wb_url.url, details=details)

        stream = BufferedReader(r.raw, block_size=BUFF_SIZE)
        record = self.loader.parse_record_stream(stream,
                                                 ensure_http_headers=True)

        memento_dt = r.headers.get('Memento-Datetime')
        target_uri = r.headers.get('WARC-Target-URI')

        cdx = CDXObject(r.headers.get('Warcserver-Cdx').encode('utf-8'))

        #cdx['urlkey'] = urlkey
        #cdx['timestamp'] = http_date_to_timestamp(memento_dt)
        #cdx['url'] = target_uri

        set_content_loc = False

        # Check if Fuzzy Match
        if target_uri != wb_url.url and cdx.get('is_fuzzy') == '1':
            set_content_loc = True

        # if redir to exact, redir if url or ts are different
        if self.redirect_to_exact:
            if (set_content_loc or
                (wb_url.timestamp != cdx.get('timestamp') and not cdx.get('is_live'))):

                new_url = urlrewriter.get_new_url(url=target_uri,
                                                  timestamp=cdx['timestamp'],
                                                  mod=wb_url.mod)

                resp = WbResponse.redir_response(new_url, '307 Temporary Redirect')
                if self.enable_memento:
                    if is_timegate and not is_proxy:
                        self._add_memento_links(target_uri, full_prefix,
                                                memento_dt, cdx['timestamp'],
                                                resp.status_headers,
                                                is_timegate, is_proxy)
                    else:
                        resp.status_headers['Link'] = MementoUtils.make_link(target_uri, 'original')

                return resp

        self._add_custom_params(cdx, r.headers, kwargs)

        # range responses are served unrewritten ('id_' modifier)
        if self._add_range(record, wb_url, range_start, range_end):
            wb_url.mod = 'id_'

        is_ajax = self.is_ajax(environ)

        if is_ajax:
            head_insert_func = None
            urlrewriter.rewrite_opts['is_ajax'] = True
        else:
            top_url = self.get_top_url(full_prefix, wb_url, cdx, kwargs)
            head_insert_func = (self.head_insert_view.
                                create_insert_func(wb_url,
                                                   full_prefix,
                                                   host_prefix,
                                                   top_url,
                                                   environ,
                                                   framed_replay,
                                                   config=self.config))

        cookie_rewriter = None
        if self.cookie_tracker:
            cookie_rewriter = self.cookie_tracker.get_rewriter(urlrewriter, cookie_key)

        urlrewriter.rewrite_opts['ua_string'] = environ.get('HTTP_USER_AGENT')

        result = content_rw(record, urlrewriter, cookie_rewriter, head_insert_func, cdx)

        status_headers, gen, is_rw = result

        if setcookie_headers:
            status_headers.headers.extend(setcookie_headers)

        # ensure statusline has a reason phrase
        if ' ' not in status_headers.statusline:
            status_headers.statusline += ' None'

        if not is_ajax and self.enable_memento:
            self._add_memento_links(cdx['url'], full_prefix,
                                    memento_dt, cdx['timestamp'],
                                    status_headers, is_timegate,
                                    is_proxy, cdx.get('source-coll'))

            set_content_loc = True

        if set_content_loc and not self.redirect_to_exact:
            status_headers.headers.append(('Content-Location',
                                           urlrewriter.get_new_url(timestamp=cdx['timestamp'],
                                                                   url=cdx['url'])))

        if not is_proxy:
            self.add_csp_header(wb_url, status_headers)

        response = WbResponse(status_headers, gen)

        return response

    def format_response(self, response, wb_url, full_prefix, is_timegate, is_proxy):
        """Wrap a non-WbResponse (rendered string) in a WbResponse and
        attach memento links if enabled."""
        memento_ts = None
        if not isinstance(response, WbResponse):
            content_type = 'text/html'

            # if not replay outer frame, specify utf-8 charset
            if not self.is_framed_replay(wb_url):
                content_type += '; charset=utf-8'
            else:
                memento_ts = wb_url.timestamp

            response = WbResponse.text_response(response, content_type=content_type)

        if self.enable_memento:
            self._add_memento_links(wb_url.url, full_prefix, None, memento_ts,
                                    response.status_headers, is_timegate, is_proxy)

        return response

    def _add_memento_links(self, url, full_prefix, memento_dt, memento_ts,
                           status_headers, is_timegate, is_proxy, coll=None):
        """Append Memento-Datetime, Link (original/timegate/timemap/memento)
        and Vary headers per the Memento protocol."""
        # memento url + header
        if not memento_dt and memento_ts:
            memento_dt = timestamp_to_http_date(memento_ts)

        if memento_dt:
            status_headers.headers.append(('Memento-Datetime', memento_dt))

            if is_proxy:
                memento_url = url
            else:
                memento_url = full_prefix + memento_ts + self.replay_mod
                memento_url += '/' + url
        else:
            memento_url = None

        timegate_url, timemap_url = self._get_timegate_timemap(url, full_prefix)

        link = []
        if not is_proxy:
            link.append(MementoUtils.make_link(url, 'original'))
            link.append(MementoUtils.make_link(timegate_url, 'timegate'))
            link.append(MementoUtils.make_link(timemap_url, 'timemap'))

        if memento_dt:
            link.append(MementoUtils.make_memento_link(memento_url, 'memento', memento_dt, coll))

        link_str = ', '.join(link)

        status_headers.headers.append(('Link', link_str))

        if is_timegate:
            status_headers.headers.append(('Vary', 'accept-datetime'))

    def _get_timegate_timemap(self, url, full_prefix):
        """Return (timegate_url, timemap_url) for *url* under *full_prefix*."""
        # timegate url
        timegate_url = full_prefix
        if self.replay_mod:
            timegate_url += self.replay_mod + '/'

        timegate_url += url

        # timemap url
        timemap_url = full_prefix + 'timemap/link/' + url
        return timegate_url, timemap_url

    def get_top_url(self, full_prefix, wb_url, cdx, kwargs):
        """Url of the outer frame corresponding to this replay url."""
        top_url = full_prefix
        top_url += wb_url.to_str(mod='')
        return top_url

    def handle_error(self, environ, ue):
        """Render an UpstreamException as a 404 or generic error page."""
        if ue.status_code == 404:
            return self._not_found_response(environ, ue.url)
        else:
            status = str(ue.status_code) + ' ' + HTTP_STATUS_CODES.get(ue.status_code, 'Unknown Error')
            return self._error_response(environ, ue.url, ue.msg, status=status)

    def _not_found_response(self, environ, url):
        resp = self.not_found_view.render_to_string(environ, url=url)

        return WbResponse.text_response(resp, status='404 Not Found', content_type='text/html')

    def _error_response(self, environ, msg='', details='', status='404 Not Found'):
        resp = self.error_view.render_to_string(environ,
                                                err_msg=msg,
                                                err_details=details)

        return WbResponse.text_response(resp, status=status, content_type='text/html')

    def _do_req(self, inputreq, wb_url, kwargs, skip_record):
        """POST the reconstructed client request to the upstream resource
        endpoint; returns the streaming requests.Response."""
        req_data = inputreq.reconstruct_request(wb_url.url)

        headers = {'Content-Length': str(len(req_data)),
                   'Content-Type': 'application/request'}

        if skip_record:
            headers['Recorder-Skip'] = '1'

        if wb_url.is_latest_replay():
            closest = 'now'
        else:
            closest = wb_url.timestamp

        params = {}
        params['url'] = wb_url.url
        params['closest'] = closest
        params['matchType'] = 'exact'

        if wb_url.mod == 'vi_':
            params['content_type'] = self.VIDEO_INFO_CONTENT_TYPE

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)

        r = requests.post(upstream_url,
                          data=BytesIO(req_data),
                          headers=headers,
                          stream=True)

        return r

    def do_query(self, wb_url, kwargs):
        """Query the upstream cdx index for *wb_url*; returns the
        requests.Response."""
        params = {}
        params['url'] = wb_url.url
        params['output'] = kwargs.get('output', 'json')
        params['from'] = wb_url.timestamp
        params['to'] = wb_url.end_timestamp

        upstream_url = self.get_upstream_url(wb_url, kwargs, params)
        # index queries hit the /index endpoint, not /resource/postreq
        upstream_url = upstream_url.replace('/resource/postreq', '/index')

        r = requests.get(upstream_url)

        return r

    def make_timemap(self, wb_url, res, full_prefix, output):
        """Build a timemap WbResponse from an upstream index response,
        wrapping with link-format headers when output == 'link'."""
        wb_url.type = wb_url.QUERY

        content_type = res.headers.get('Content-Type')
        text = res.text

        if not res.text:
            status = '404 Not Found'

        elif res.status_code:
            # NOTE(review): assumes res.status_code is always truthy for a
            # requests response; otherwise 'status' would be unbound
            status = str(res.status_code) + ' ' + res.reason

            if res.status_code == 200 and output == 'link':
                timegate, timemap = self._get_timegate_timemap(wb_url.url, full_prefix)

                text = MementoUtils.wrap_timemap_header(wb_url.url,
                                                        timegate,
                                                        timemap,
                                                        res.text)

        return WbResponse.text_response(text,
                                        content_type=content_type,
                                        status=status)

    def handle_timemap(self, wb_url, kwargs, full_prefix):
        output = kwargs.get('output')
        res = self.do_query(wb_url, kwargs)
        return self.make_timemap(wb_url, res, full_prefix, output)

    def handle_query(self, environ, wb_url, kwargs, full_prefix):
        prefix = self.get_full_prefix(environ)

        params = dict(url=wb_url.url,
                      prefix=prefix)

        return self.query_view.render_to_string(environ, **params)

    def get_host_prefix(self, environ):
        """Return scheme://host for this request, preferring the proxy
        host, then HTTP_HOST, then SERVER_NAME[:SERVER_PORT]."""
        scheme = environ['wsgi.url_scheme'] + '://'

        # proxy
        host = environ.get('wsgiprox.proxy_host')
        if host:
            return scheme + host

        # default
        host = environ.get('HTTP_HOST')
        if host:
            return scheme + host

        # if no host, fall back to server name, adding non-default ports
        host = environ['SERVER_NAME']
        if environ['wsgi.url_scheme'] == 'https':
            if environ['SERVER_PORT'] != '443':
                host += ':' + environ['SERVER_PORT']
        else:
            if environ['SERVER_PORT'] != '80':
                host += ':' + environ['SERVER_PORT']

        return scheme + host

    def get_rel_prefix(self, environ):
        #return request.script_name
        # default '' guards against a missing SCRIPT_NAME key
        return environ.get('SCRIPT_NAME', '') + '/'

    def get_full_prefix(self, environ):
        return self.get_host_prefix(environ) + self.get_rel_prefix(environ)

    def unrewrite_referrer(self, environ, full_prefix):
        """Strip the replay prefix from HTTP_REFERER so the original url
        is passed upstream. Returns True if the header was rewritten."""
        referrer = environ.get('HTTP_REFERER')
        if not referrer:
            return False

        if referrer.startswith(full_prefix):
            referrer = referrer[len(full_prefix):]
            if referrer:
                environ['HTTP_REFERER'] = WbUrl(referrer).url
                return True

        return False

    def is_ajax(self, environ):
        """True if the request carries an XMLHttpRequest marker header."""
        value = environ.get('HTTP_X_REQUESTED_WITH')
        value = value or environ.get('HTTP_X_PYWB_REQUESTED_WITH')
        if value and value.lower() == 'xmlhttprequest':
            return True

        return False

    def get_base_url(self, wb_url, kwargs):
        """Resolve the upstream base url template for kwargs['type']."""
        # renamed from 'type' to avoid shadowing the builtin
        type_ = kwargs.get('type')
        return self.paths[type_].format(**kwargs)

    def get_upstream_url(self, wb_url, kwargs, params):
        """Base url plus urlencoded *params*, appended with '?' or '&'
        depending on whether the base already has a query string."""
        base_url = self.get_base_url(wb_url, kwargs)
        param_str = urlencode(params, True)
        if param_str:
            q_char = '&' if '?' in base_url else '?'
            base_url += q_char + param_str

        return base_url

    def get_cookie_key(self, kwargs):
        """Subclasses must override to supply the cookie-tracker key."""
        # fix: original raised NotImplemented (not an exception class)
        raise NotImplementedError()

    def _add_custom_params(self, cdx, headers, kwargs):
        """Hook for subclasses to augment the cdx object. No-op here."""
        pass

    def get_top_frame_params(self, wb_url, kwargs):
        """Hook for subclasses to pass extra params to the top frame."""
        return None

    def handle_custom_response(self, environ, wb_url, full_prefix, host_prefix, kwargs):
        """Dispatch timemap, query, and top-frame requests; returns None
        for a normal replay request."""
        if kwargs.get('output'):
            return self.handle_timemap(wb_url, kwargs, full_prefix)

        if wb_url.is_query():
            return self.handle_query(environ, wb_url, kwargs, full_prefix)

        if self.is_framed_replay(wb_url):
            extra_params = self.get_top_frame_params(wb_url, kwargs)
            return self.frame_insert_view.get_top_frame(wb_url,
                                                        full_prefix,
                                                        host_prefix,
                                                        environ,
                                                        self.frame_mod,
                                                        self.replay_mod,
                                                        coll='',
                                                        extra_params=extra_params)

        return None