def parse_mem_value(self, m):
    """Convert a memento dict (with 'datetime' and 'uri' keys) into a
    MemValue of (timestamp, seconds, url)."""
    parsed_dt = iso_date_to_datetime(m['datetime'])
    return MemValue(datetime_to_timestamp(parsed_dt),
                    datetime_to_secs(parsed_dt),
                    m['uri'])
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  proxies=None):
    """Fetch *url* (live http(s) or a local file), then rewrite the
    response content for replay.

    :param url: url to fetch; '///' and '//'-prefixed forms are fixed up
    :param urlrewriter: rewriter passed through to rewrite_content
    :param head_insert_func: optional head-insert callback for rewriting
    :param urlkey: explicit canonicalized key (say for testing);
                   computed via canonicalize(url) when not given
    :param env: optional WSGI-style dict; receives the cdx as 'pywb.cdx'
    :param req_headers: optional headers dict for the upstream request
    :param timestamp: capture timestamp; defaults to current UTC time
    :param follow_redirects: passed through to fetch_http
    :param proxies: passed through to fetch_http
    :return: result of self.rewriter.rewrite_content(...)
    """
    # BUG FIX: req_headers previously defaulted to a shared mutable {},
    # which persists across calls and can be polluted by any mutation
    # downstream (e.g. inside fetch_http).  Use the None sentinel idiom.
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    # protocol-relative url: assume http
    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects,
                                                   proxies)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if timestamp is None:
        timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

    cdx = {'urlkey': urlkey,
           'timestamp': timestamp,
           'original': url,
           'statuscode': status_headers.get_statuscode(),
           'mimetype': status_headers.get_header('Content-Type'),
           'is_live': True,
           }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    if env:
        env['pywb.cdx'] = cdx

    return result
def fetch_request(self, url, urlrewriter,
                  head_insert_func=None,
                  urlkey=None,
                  env=None,
                  req_headers=None,
                  timestamp=None,
                  follow_redirects=False,
                  proxies=None):
    """Fetch *url* (live http(s) or a local file), then rewrite the
    response content for replay.

    :param url: url to fetch; '///' and '//'-prefixed forms are fixed up
    :param urlrewriter: rewriter passed through to rewrite_content
    :param head_insert_func: optional head-insert callback for rewriting
    :param urlkey: explicit canonicalized key (say for testing);
                   computed via canonicalize(url) when not given
    :param env: optional WSGI-style dict; receives the cdx as 'pywb.cdx'
    :param req_headers: optional headers dict for the upstream request
    :param timestamp: capture timestamp; defaults to current UTC time
    :param follow_redirects: passed through to fetch_http
    :param proxies: passed through to fetch_http
    :return: result of self.rewriter.rewrite_content(...)
    """
    # BUG FIX: req_headers previously defaulted to a shared mutable {},
    # which persists across calls and can be polluted by any mutation
    # downstream (e.g. inside fetch_http).  Use the None sentinel idiom.
    if req_headers is None:
        req_headers = {}

    ts_err = url.split('///')

    # fixup for accidental erroneous rewrite which has ///
    # (unless file:///)
    if len(ts_err) > 1 and ts_err[0] != 'file:':
        url = 'http://' + ts_err[1]

    # protocol-relative url: assume http
    if url.startswith('//'):
        url = 'http:' + url

    if is_http(url):
        (status_headers, stream) = self.fetch_http(url, env, req_headers,
                                                   follow_redirects,
                                                   proxies)
    else:
        (status_headers, stream) = self.fetch_local_file(url)

    # explicit urlkey may be passed in (say for testing)
    if not urlkey:
        urlkey = canonicalize(url)

    if timestamp is None:
        timestamp = datetime_to_timestamp(datetime.datetime.utcnow())

    cdx = {'urlkey': urlkey,
           'timestamp': timestamp,
           'original': url,
           'statuscode': status_headers.get_statuscode(),
           'mimetype': status_headers.get_header('Content-Type'),
           'is_live': True,
           }

    result = (self.rewriter.
              rewrite_content(urlrewriter,
                              status_headers,
                              stream,
                              head_insert_func=head_insert_func,
                              urlkey=urlkey,
                              cdx=cdx))

    if env:
        env['pywb.cdx'] = cdx

    return result
def get_top_frame_params(self, wbrequest, mod):
    """Build the template parameters for rendering the top (banner) frame.

    Uses the request url's own timestamp when present, otherwise the
    current UTC time.
    """
    ts = wbrequest.wb_url.timestamp
    if not ts:
        ts = datetime_to_timestamp(datetime.utcnow())

    return dict(embed_url=wbrequest.wb_url.to_str(mod=mod),
                wbrequest=wbrequest,
                timestamp=ts,
                url=wbrequest.wb_url.get_url(),
                banner_html=self.banner_html)
def snapshot(self):
    """Snapshot the posted HTML body into the user's collection archive.

    Un-rewrites the submitted DOM back to original urls, then PUTs it
    through the warcprox proxy (custom 'PUTRES' method) with a
    'warcprox-meta' header describing the target warc.  Optionally
    records a page entry.  Returns a status dict for the JSON response.

    :raises HTTPError: 404 if no url given or collection not writable
    """
    coll = request.query.get('coll', '')
    if coll == '@anon':
        user = self.manager.get_anon_user()
    else:
        user, coll = self.path_parser.get_user_coll(coll)

    url = request.query.get('url', '')
    if not url or not self.manager.can_write_coll(user, coll):
        raise HTTPError(status=404, body='No Such Page')

    title = request.query.get('title', '')
    add_page = request.query.get('addpage', False)

    # BUG FIX: request.body is a binary stream; decode to text before
    # un-rewriting and re-encode when sending (consistent with the
    # other snapshot() implementation in this file)
    html_text = request.body.read().decode('utf-8')

    #host = get_host()
    host = WbRequest.make_host_prefix(request.environ)
    prefix = request.query.get('prefix', host)

    orig_html = HTMLDomUnRewriter.unrewrite_html(host, prefix, html_text)

    dt = datetime.utcnow()
    sesh_id = self.path_parser.get_coll_path(user, coll)

    # warcprox-meta target: tells warcprox where/how to write the record
    target = dict(output_dir=self.path_parser.get_archive_dir(user, coll),
                  sesh_id=sesh_id.replace('/', ':'),
                  user_id=user,
                  name_prefix=self.path_parser.get_name_prefix(user, coll),
                  json_metadata={'snapshot': 'html', 'timestamp': str(dt)},
                  writer_type='-snapshot')

    # proxying only works over http; downgrade https urls
    if url.startswith('https://'):
        url = url.replace('https:', 'http:')

    req_headers = {'warcprox-meta': json.dumps(target),
                   'content-type': 'text/html',
                   'user-agent': request.headers.get('user-agent')
                   }

    pagedata = {'url': url,
                'title': title,
                'tags': ['snapshot'],
                'ts': datetime_to_timestamp(dt)
                }

    try:
        resp = requests.request(method='PUTRES',
                                url=url,
                                data=orig_html.encode('utf-8'),
                                headers=req_headers,
                                proxies=self.warcprox_proxies,
                                verify=False)

        if add_page:
            self.manager.add_page(user, coll, pagedata)

    # BUG FIX: was a bare 'except:' which silently swallowed everything,
    # including SystemExit/KeyboardInterrupt, with no diagnostics
    except Exception:
        import traceback
        traceback.print_exc()
        return {'status': 'err'}

    return {'status': resp.status_code}
def load_resource(self, cdx, params):
    """Fetch the cdx record's 'load_url' from the live web and package
    the response as a WARC-style record.

    :param cdx: cdx dict; reads 'load_url', 'timestamp', 'memento_url',
                'url'; may set 'timestamp' and 'source'
    :param params: request params; reads '_input_req', 'content_type',
                   '_timeout'
    :return: (warc_headers, http_headers_buff, stream) where
             warc_headers is a StatusAndHeaders for the synthesized WARC
             record and http_headers_buff is the reconstructed HTTP
             status line + headers as latin-1 bytes; or
             (None, headers, stream) when upstream already returned a
             full WARC record ('WebAgg-Type: warc'); or None when this
             loader does not apply / the memento response is invalid
    :raises LiveResourceException: if the upstream fetch fails
    """
    load_url = cdx.get('load_url')
    if not load_url:
        return None

    # video content is handled by a different loader
    if params.get('content_type') == VideoLoader.CONTENT_TYPE:
        return None

    input_req = params['_input_req']
    req_headers = input_req.get_req_headers()

    dt = timestamp_to_datetime(cdx['timestamp'])

    # when proxying to an upstream memento endpoint, request the capture
    # nearest the cdx timestamp
    if cdx.get('memento_url'):
        req_headers['Accept-Datetime'] = datetime_to_http_date(dt)

    method = input_req.get_req_method()
    data = input_req.get_req_body()

    # use requests' PreparedRequest only to normalize the url and to pull
    # any auth embedded in it into an Authorization header
    p = PreparedRequest()
    p.prepare_url(load_url, None)
    p.prepare_headers(None)
    p.prepare_auth(None, load_url)

    auth = p.headers.get('Authorization')
    if auth:
        req_headers['Authorization'] = auth

    load_url = p.url

    try:
        upstream_res = self.pool.urlopen(method=method,
                                         url=load_url,
                                         body=data,
                                         headers=req_headers,
                                         redirect=False,
                                         assert_same_host=False,
                                         preload_content=False,
                                         decode_content=False,
                                         retries=self.num_retries,
                                         timeout=params.get('_timeout'))
    except Exception as e:
        # wrap any fetch failure (connect, timeout, etc.) in a single
        # exception type for callers
        raise LiveResourceException(load_url)

    memento_dt = upstream_res.headers.get('Memento-Datetime')
    if memento_dt:
        # prefer the actual capture time reported by upstream
        dt = http_date_to_datetime(memento_dt)
        cdx['timestamp'] = datetime_to_timestamp(dt)
    elif cdx.get('memento_url'):
        # if 'memento_url' set and no Memento-Datetime header present
        # then its an error
        return None

    agg_type = upstream_res.headers.get('WebAgg-Type')
    if agg_type == 'warc':
        # upstream response is already a complete warc record:
        # record its source and pass the stream through unchanged
        cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll'))
        return None, upstream_res.headers, upstream_res

    self.raise_on_self_redirect(params, cdx,
                                str(upstream_res.status),
                                upstream_res.headers.get('Location'))

    if upstream_res.version == 11:
        version = '1.1'
    else:
        version = '1.0'

    # rebuild the raw HTTP status line + headers from the original,
    # unparsed response, skipping headers in SKIP_HEADERS
    status = 'HTTP/{version} {status} {reason}\r\n'
    status = status.format(version=version,
                           status=upstream_res.status,
                           reason=upstream_res.reason)

    http_headers_buff = status

    orig_resp = upstream_res._original_response

    try:  #pragma: no cover
        #PY 3
        resp_headers = orig_resp.headers._headers
        for n, v in resp_headers:
            if n.lower() in self.SKIP_HEADERS:
                continue
            http_headers_buff += n + ': ' + v + '\r\n'
    except:  #pragma: no cover
        #PY 2
        # v is appended verbatim here, so each entry in msg.headers is
        # expected to already be a full 'Name: value\r\n' line
        resp_headers = orig_resp.msg.headers
        for n, v in zip(orig_resp.getheaders(), resp_headers):
            if n in self.SKIP_HEADERS:
                continue
            http_headers_buff += v

    http_headers_buff += '\r\n'
    http_headers_buff = http_headers_buff.encode('latin-1')

    try:
        # dig out the remote peer ip (for WARC-IP-Address) from the
        # underlying socket; best-effort only
        fp = upstream_res._fp.fp
        if hasattr(fp, 'raw'):  #pragma: no cover
            fp = fp.raw
        remote_ip = fp._sock.getpeername()[0]
    except:  #pragma: no cover
        remote_ip = None

    warc_headers = {}
    warc_headers['WARC-Type'] = 'response'
    warc_headers['WARC-Record-ID'] = self._make_warc_id()
    warc_headers['WARC-Target-URI'] = cdx['url']
    warc_headers['WARC-Date'] = datetime_to_iso_date(dt)
    if remote_ip:
        warc_headers['WARC-IP-Address'] = remote_ip

    warc_headers['Content-Type'] = 'application/http; msgtype=response'

    self._set_content_len(upstream_res.headers.get('Content-Length', -1),
                          warc_headers,
                          len(http_headers_buff))

    warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items())

    return (warc_headers, http_headers_buff, upstream_res)
def snapshot(self):
    """Snapshot the posted HTML body into the user's collection archive.

    Un-rewrites the submitted DOM back to original urls, then PUTs it
    through the warcprox proxy (custom 'PUTRES' method) with a
    'warcprox-meta' header describing the target warc.  Optionally
    records a page entry.  Returns a status dict for the JSON response.

    :raises HTTPError: 404 if no url given or collection not writable
    """
    coll = request.query.get('coll', '')
    if coll == '@anon':
        user = self.manager.get_anon_user()
    else:
        user, coll = self.path_parser.get_user_coll(coll)

    url = request.query.get('url', '')
    if not url or not self.manager.can_write_coll(user, coll):
        raise HTTPError(status=404, body='No Such Page')

    title = request.query.get('title', '')
    add_page = request.query.get('addpage', False)

    # body arrives as bytes; un-rewriter works on text
    html_text = request.body.read().decode('utf-8')

    #host = get_host()
    host = WbRequest.make_host_prefix(request.environ)
    prefix = request.query.get('prefix', host)

    orig_html = HTMLDomUnRewriter.unrewrite_html(host, prefix, html_text)

    dt = datetime.utcnow()
    sesh_id = self.path_parser.get_coll_path(user, coll)

    # warcprox-meta target: where/how warcprox should write the record
    target = {'output_dir': self.path_parser.get_archive_dir(user, coll),
              'sesh_id': sesh_id.replace('/', ':'),
              'user_id': user,
              'name_prefix': self.path_parser.get_name_prefix(user, coll),
              'json_metadata': {'snapshot': 'html', 'timestamp': str(dt)},
              'writer_type': '-snapshot'}

    # proxying only works over http; downgrade https urls
    if url.startswith('https://'):
        url = url.replace('https:', 'http:')

    req_headers = {'warcprox-meta': json.dumps(target),
                   'content-type': 'text/html',
                   'user-agent': request.headers.get('user-agent')}

    pagedata = {'url': url,
                'title': title,
                'tags': ['snapshot'],
                'ts': datetime_to_timestamp(dt)}

    try:
        resp = requests.request(method='PUTRES',
                                url=url,
                                data=orig_html.encode('utf-8'),
                                headers=req_headers,
                                proxies=self.warcprox_proxies,
                                verify=False)

        if add_page:
            self.manager.add_page(user, coll, pagedata)

    except Exception:
        # best-effort: log the failure, report generic error status
        import traceback
        traceback.print_exc()
        return {'status': 'err'}

    return {'status': resp.status_code}