def rebuild_auth(self, prepared_request: PreparedRequest, response: Response) -> None: """ Override Session.rebuild_auth. Strips the Authorization header if neither original URL nor redirected URL belong to an Earthdata Login (EDL) host. Also allows the default requests behavior of searching for relevant .netrc credentials if and only if a username and password weren't provided during object instantiation. Args: prepared_request: Object for the redirection destination. response: Object for the where we just came from. """ headers = prepared_request.headers redirect_hostname = cast(str, urlparse(prepared_request.url).hostname) original_hostname = cast(str, urlparse(response.request.url).hostname) if ('Authorization' in headers and (original_hostname != redirect_hostname) and not _is_edl_hostname(redirect_hostname)): del headers['Authorization'] if self.auth is None: # .netrc might have more auth for us on our new host. new_auth = get_netrc_auth( prepared_request.url) if self.trust_env else None if new_auth is not None: prepared_request.prepare_auth(new_auth) return
def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None if params.get('content_type') == VideoLoader.CONTENT_TYPE: return None input_req = params['_input_req'] req_headers = input_req.get_req_headers() dt = timestamp_to_datetime(cdx['timestamp']) if cdx.get('memento_url'): req_headers['Accept-Datetime'] = datetime_to_http_date(dt) method = input_req.get_req_method() data = input_req.get_req_body() p = PreparedRequest() p.prepare_url(load_url, None) p.prepare_headers(None) p.prepare_auth(None, load_url) auth = p.headers.get('Authorization') if auth: req_headers['Authorization'] = auth load_url = p.url try: upstream_res = self.pool.urlopen(method=method, url=load_url, body=data, headers=req_headers, redirect=False, assert_same_host=False, preload_content=False, decode_content=False, retries=self.num_retries, timeout=params.get('_timeout')) except Exception as e: raise LiveResourceException(load_url) memento_dt = upstream_res.headers.get('Memento-Datetime') if memento_dt: dt = http_date_to_datetime(memento_dt) cdx['timestamp'] = datetime_to_timestamp(dt) elif cdx.get('memento_url'): # if 'memento_url' set and no Memento-Datetime header present # then its an error return None agg_type = upstream_res.headers.get('WebAgg-Type') if agg_type == 'warc': cdx['source'] = unquote(upstream_res.headers.get('WebAgg-Source-Coll')) return None, upstream_res.headers, upstream_res self.raise_on_self_redirect(params, cdx, str(upstream_res.status), upstream_res.headers.get('Location')) if upstream_res.version == 11: version = '1.1' else: version = '1.0' status = 'HTTP/{version} {status} {reason}\r\n' status = status.format(version=version, status=upstream_res.status, reason=upstream_res.reason) http_headers_buff = status orig_resp = upstream_res._original_response try: #pragma: no cover #PY 3 resp_headers = orig_resp.headers._headers for n, v in resp_headers: if n.lower() in self.SKIP_HEADERS: continue http_headers_buff += n + ': ' + v + '\r\n' except: #pragma: no cover #PY 2 resp_headers = orig_resp.msg.headers for n, v in zip(orig_resp.getheaders(), resp_headers): if n in self.SKIP_HEADERS: continue http_headers_buff += v http_headers_buff += '\r\n' http_headers_buff = http_headers_buff.encode('latin-1') try: fp = upstream_res._fp.fp if hasattr(fp, 'raw'): #pragma: no cover fp = fp.raw remote_ip = fp._sock.getpeername()[0] except: #pragma: no cover remote_ip = None warc_headers = {} warc_headers['WARC-Type'] = 'response' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip warc_headers['Content-Type'] = 'application/http; msgtype=response' self._set_content_len(upstream_res.headers.get('Content-Length', -1), warc_headers, len(http_headers_buff)) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res)
def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None if params.get('content_type') == VideoLoader.CONTENT_TYPE: return None if self.forward_proxy_prefix and not cdx.get('is_live'): load_url = self.forward_proxy_prefix + load_url input_req = params['_input_req'] req_headers = input_req.get_req_headers() dt = timestamp_to_datetime(cdx['timestamp']) if cdx.get('memento_url'): req_headers['Accept-Datetime'] = datetime_to_http_date(dt) method = input_req.get_req_method() data = input_req.get_req_body() p = PreparedRequest() try: p.prepare_url(load_url, None) except: raise LiveResourceException(load_url) p.prepare_headers(None) p.prepare_auth(None, load_url) auth = p.headers.get('Authorization') if auth: req_headers['Authorization'] = auth load_url = p.url # host is set to the actual host for live loading # ensure it is set to the load_url host if not cdx.get('is_live'): #req_headers.pop('Host', '') req_headers['Host'] = urlsplit(p.url).netloc referrer = cdx.get('set_referrer') if referrer: req_headers['Referer'] = referrer upstream_res = self._do_request_with_redir_check( method, load_url, data, req_headers, params, cdx) memento_dt = upstream_res.headers.get('Memento-Datetime') if memento_dt: dt = http_date_to_datetime(memento_dt) cdx['timestamp'] = datetime_to_timestamp(dt) elif cdx.get('memento_url'): # if 'memento_url' set and no Memento-Datetime header present # then its an error return None agg_type = upstream_res.headers.get('Warcserver-Type') if agg_type == 'warc': cdx['source'] = unquote( upstream_res.headers.get('Warcserver-Source-Coll')) return None, upstream_res.headers, upstream_res if upstream_res.version == 11: version = '1.1' else: version = '1.0' status = 'HTTP/{version} {status} {reason}\r\n' status = status.format(version=version, status=upstream_res.status, reason=upstream_res.reason) http_headers_buff = status orig_resp = upstream_res._original_response try: #pragma: no cover #PY 3 resp_headers = orig_resp.headers._headers for n, v in resp_headers: nl = n.lower() if nl in self.SKIP_HEADERS: continue if nl in self.UNREWRITE_HEADERS: v = self.unrewrite_header(cdx, v) http_headers_buff += n + ': ' + v + '\r\n' http_headers_buff += '\r\n' try: # http headers could be encoded as utf-8 (though non-standard) # first try utf-8 encoding http_headers_buff = http_headers_buff.encode('utf-8') except: # then, fall back to latin-1 http_headers_buff = http_headers_buff.encode('latin-1') except: #pragma: no cover #PY 2 resp_headers = orig_resp.msg.headers for line in resp_headers: n, v = line.split(':', 1) n = n.lower() v = v.strip() if n in self.SKIP_HEADERS: continue new_v = v if n in self.UNREWRITE_HEADERS: new_v = self.unrewrite_header(cdx, v) if new_v != v: http_headers_buff += n + ': ' + new_v + '\r\n' else: http_headers_buff += line # if python2, already byte headers, so leave as is http_headers_buff += '\r\n' try: fp = upstream_res._fp.fp if hasattr(fp, 'raw'): #pragma: no cover fp = fp.raw remote_ip = fp._sock.getpeername()[0] except: #pragma: no cover remote_ip = None warc_headers = {} warc_headers['WARC-Type'] = 'response' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if not cdx.get('is_live'): now = datetime.datetime.utcnow() warc_headers['WARC-Source-URI'] = cdx.get('load_url') warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip ct = upstream_res.headers.get('Content-Type') if ct: metadata = self.get_custom_metadata(ct, dt) if metadata: warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata) warc_headers['Content-Type'] = 'application/http; msgtype=response' if method == 'HEAD': content_len = 0 else: content_len = upstream_res.headers.get('Content-Length', -1) self._set_content_len(content_len, warc_headers, len(http_headers_buff)) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res)
def load_resource(self, cdx, params): load_url = cdx.get('load_url') if not load_url: return None if params.get('content_type') == VideoLoader.CONTENT_TYPE: return None if self.forward_proxy_prefix and not cdx.get('is_live'): load_url = self.forward_proxy_prefix + load_url input_req = params['_input_req'] req_headers = input_req.get_req_headers() dt = timestamp_to_datetime(cdx['timestamp']) if cdx.get('memento_url'): req_headers['Accept-Datetime'] = datetime_to_http_date(dt) method = input_req.get_req_method() data = input_req.get_req_body() p = PreparedRequest() try: p.prepare_url(load_url, None) except: raise LiveResourceException(load_url) p.prepare_headers(None) p.prepare_auth(None, load_url) auth = p.headers.get('Authorization') if auth: req_headers['Authorization'] = auth load_url = p.url # host is set to the actual host for live loading # ensure it is set to the load_url host if not cdx.get('is_live'): #req_headers.pop('Host', '') req_headers['Host'] = urlsplit(p.url).netloc referrer = cdx.get('set_referrer') if referrer: req_headers['Referer'] = referrer upstream_res = self._do_request_with_redir_check(method, load_url, data, req_headers, params, cdx) memento_dt = upstream_res.headers.get('Memento-Datetime') if memento_dt: dt = http_date_to_datetime(memento_dt) cdx['timestamp'] = datetime_to_timestamp(dt) elif cdx.get('memento_url'): # if 'memento_url' set and no Memento-Datetime header present # then its an error return None agg_type = upstream_res.headers.get('Warcserver-Type') if agg_type == 'warc': cdx['source'] = unquote(upstream_res.headers.get('Warcserver-Source-Coll')) return None, upstream_res.headers, upstream_res if upstream_res.version == 11: version = '1.1' else: version = '1.0' status = 'HTTP/{version} {status} {reason}\r\n' status = status.format(version=version, status=upstream_res.status, reason=upstream_res.reason) http_headers_buff = status orig_resp = upstream_res._original_response try: #pragma: no cover #PY 3 resp_headers = orig_resp.headers._headers for n, v in resp_headers: nl = n.lower() if nl in self.SKIP_HEADERS: continue if nl in self.UNREWRITE_HEADERS: v = self.unrewrite_header(cdx, v) http_headers_buff += n + ': ' + v + '\r\n' http_headers_buff += '\r\n' try: # http headers could be encoded as utf-8 (though non-standard) # first try utf-8 encoding http_headers_buff = http_headers_buff.encode('utf-8') except: # then, fall back to latin-1 http_headers_buff = http_headers_buff.encode('latin-1') except: #pragma: no cover #PY 2 resp_headers = orig_resp.msg.headers for line in resp_headers: n, v = line.split(':', 1) n = n.lower() v = v.strip() if n in self.SKIP_HEADERS: continue new_v = v if n in self.UNREWRITE_HEADERS: new_v = self.unrewrite_header(cdx, v) if new_v != v: http_headers_buff += n + ': ' + new_v + '\r\n' else: http_headers_buff += line # if python2, already byte headers, so leave as is http_headers_buff += '\r\n' try: fp = upstream_res._fp.fp if hasattr(fp, 'raw'): #pragma: no cover fp = fp.raw remote_ip = fp._sock.getpeername()[0] except: #pragma: no cover remote_ip = None warc_headers = {} warc_headers['WARC-Type'] = 'response' warc_headers['WARC-Record-ID'] = self._make_warc_id() warc_headers['WARC-Target-URI'] = cdx['url'] warc_headers['WARC-Date'] = datetime_to_iso_date(dt) if not cdx.get('is_live'): now = datetime.datetime.utcnow() warc_headers['WARC-Source-URI'] = cdx.get('load_url') warc_headers['WARC-Creation-Date'] = datetime_to_iso_date(now) if remote_ip: warc_headers['WARC-IP-Address'] = remote_ip ct = upstream_res.headers.get('Content-Type') if ct: metadata = self.get_custom_metadata(ct, dt) if metadata: warc_headers['WARC-JSON-Metadata'] = json.dumps(metadata) warc_headers['Content-Type'] = 'application/http; msgtype=response' if method == 'HEAD': content_len = 0 else: content_len = upstream_res.headers.get('Content-Length', -1) self._set_content_len(content_len, warc_headers, len(http_headers_buff)) warc_headers = StatusAndHeaders('WARC/1.0', warc_headers.items()) return (warc_headers, http_headers_buff, upstream_res)