def normalize_url(url):
    """Return the normalized form of ``url``, or ``None`` if its scheme is rejected.

    Input without a scheme and without ``//`` is re-parsed as a host.  A
    missing scheme is filled in with ``https``, or ``http`` when
    ``model.host_use_http`` accepts the host.  The query and fragment are
    dropped, the host is IDNA-encoded and lower-cased, and a bare ``/`` path
    is removed before the final ``rfc3986.normalize_uri`` call.
    """
    parsed = rfc3986.urlparse(url)

    # No protocol and no "//": RFC 3986 calls this a path, but here it is
    # known to be a host, so parse it again as an authority.
    if parsed.scheme is None and parsed.host is None and parsed.path is not None:
        parsed = rfc3986.urlparse('//' + url)

    if parsed.scheme is None:
        # The URL starts with "//": add https (or http for .onion / .i2p TLDs).
        default_scheme = 'http' if model.host_use_http(parsed.host) else 'https'
        parsed = parsed.copy_with(scheme=default_scheme)

    # First normalization:
    # * IDNA encoding to avoid a misleading host
    # * remove query and fragment
    # * remove an empty path
    parsed = parsed.copy_with(
        scheme=parsed.scheme.lower(),
        host=idna.encode(parsed.host).decode('utf-8').lower(),
        path='' if parsed.path == '/' else parsed.path,
        query=None,
        fragment=None,
    )

    # Only https is accepted (exception: http for .onion and .i2p TLDs).
    if parsed.scheme == 'https':
        accepted = not model.host_use_http(parsed.host)
    elif parsed.scheme == 'http':
        accepted = model.host_use_http(parsed.host)
    else:
        accepted = False

    if accepted:
        return rfc3986.normalize_uri(parsed.geturl())
    return None
def __init__(self, scope, receive):
    """Wrap ``scope``/``receive`` in a ``StarletteRequest`` and cache derived fields."""
    self._starlette = StarletteRequest(scope, receive)
    self.formats = None
    self.encoding = "utf-8"

    header_map = CaseInsensitiveDict()
    for name, content in self._starlette.headers.items():
        header_map[name] = content

    #: A case-insensitive dictionary, containing all headers sent in the Request.
    self.headers = header_map

    self.mimetype = self.headers.get("Content-Type", "")

    #: The incoming HTTP method used for the request, lower-cased.
    self.method = self._starlette.method.lower()

    #: The full URL of the Request, query parameters and all.
    self.full_url = str(self._starlette.url)

    #: The parsed URL of the Request
    self.url = rfc3986.urlparse(self.full_url)

    try:
        #: A dictionary of the parsed query parameters used for the Request.
        self.params = QueryDict(self.url.query)
    except AttributeError:
        # Fall back to an empty mapping when the query cannot be parsed.
        self.params = {}
def __repr__(self) -> str:
    """Return ``ClassName('<url>')`` with the password part shown as ``[secure]``."""
    class_name = self.__class__.__name__
    rendered = str(self)
    if self._uri_reference.userinfo:
        # Rebuild the URL with masked userinfo so the password never appears.
        reference = rfc3986.urlparse(rendered)
        masked = reference.copy_with(userinfo=f"{self.username}:[secure]")
        rendered = masked.unsplit()
    return f"{class_name}({rendered!r})"
def str_parse(text):
    """Return ``text`` unchanged if it parses with a scheme, else its parsed path.

    When neither a scheme nor a path is found, an empty unicode string is
    returned.
    """
    parsed = urlparse(text.encode('utf8'))
    if parsed.scheme is not None:
        return text
    return u'' if parsed.path is None else parsed.path
def get_file_uri(url):
    """Parse ``url`` with rfc3986 and return the result of ``validate_uri``.

    Validation errors (missing, unpermitted or invalid components) are logged
    and re-raised to the caller.
    """
    parsed = rfc3986.urlparse(url)
    try:
        return validate_uri(parsed)
    except (MissingComponentError,
            UnpermittedComponentError,
            InvalidComponentsError) as e:
        # The message is formatted with the parsed object, not the raw input.
        logging.error("File uri '{0}' not valid".format(parsed))
        raise e
def save_acme_key_as_file(logger, bytes, user_provided_path):
    """Store the key ``bytes`` at the ``file://`` or ``s3://`` location given.

    Raises:
        ValueError: if ``user_provided_path`` uses any other scheme.
    """
    target = rfc3986.urlparse(user_provided_path)

    if target.scheme == "file":
        save_file_to_disc(logger, bytes, get_filepath(target))
        return
    if target.scheme == "s3":
        save_file_to_s3(logger, bytes, target)
        return

    raise ValueError(
        "Invalid acme account key: {!r}".format(user_provided_path))
def __repr__(self) -> str:
    """Return ``ClassName('<url>')`` with the password part shown as ``[secure]``."""
    class_name = self.__class__.__name__
    rendered = str(self)
    if self._uri_reference.userinfo:
        # Hide the password in the representation to reduce the risk of it
        # leaking through debug output or logs.
        safe_username = quote(self.username)
        reference = rfc3986.urlparse(rendered)
        rendered = reference.copy_with(
            userinfo=f"{safe_username}:[secure]").unsplit()
    return f"{class_name}({rendered!r})"
def __init__(self):
    """Connect ``self.ipfs_client`` to the hard-coded SingularityNET IPFS endpoint."""
    endpoint = urlparse("https://ipfs.singularitynet.io:80")

    # Defaults used when the endpoint string carries no scheme or port.
    scheme = endpoint.scheme or "http"
    port = endpoint.port or 5001

    # NOTE(review): if urljoin is urllib.parse.urljoin, a bare scheme as the
    # base appears to yield only the hostname, so the scheme is dropped -
    # confirm this is intended.
    host_url = urljoin(scheme, endpoint.hostname)
    self.ipfs_client = ipfsapi.connect(host_url, port, session=True)
def url_path(url_path):
    """Raise ``InvalidURLPath`` unless ``url_path`` is a bare absolute path.

    Returns ``True`` when the value has no scheme, userinfo, host or port and
    its path starts with ``/``.  Any error during parsing is also reported as
    ``InvalidURLPath``.
    """
    try:
        parsed = rfc3986.urlparse(rfc3986.normalize_uri(url_path))
        beyond_path = (parsed.scheme or parsed.userinfo
                       or parsed.host or parsed.port)
        if (beyond_path or parsed.path is None
                or not parsed.path.startswith('/')):
            raise exceptions.InvalidURLPath(url_path=url_path)
    except Exception:
        raise exceptions.InvalidURLPath(url_path=url_path)
    return True
def url(url):
    """Raise ``InvalidURL`` unless ``url`` is a valid http(s) URI; else return True.

    Any error raised while validating or parsing is reported as ``InvalidURL``.
    """
    try:
        if not rfc3986.is_valid_uri(url, require_scheme=True):
            raise exceptions.InvalidURL(url=url)
        normalized = rfc3986.normalize_uri(url)
        if rfc3986.urlparse(normalized).scheme not in ('http', 'https'):
            raise exceptions.InvalidURL(url=url)
    except Exception:
        raise exceptions.InvalidURL(url=url)
    return True
def url_validation(url):
    """Check that ``url`` is normalized, uses an allowed scheme, and has no extras.

    Returns:
        ``(True, None)`` on success, otherwise ``(False, reason)`` where
        ``reason`` is a message describing the first failed check.
    """
    normalized = rfc3986.normalize_uri(url)
    if normalized != url:
        return False, f'URL must be normalized to {normalized}'

    parsed = rfc3986.urlparse(normalized)

    # https for regular hosts, http only for hosts host_use_http() accepts.
    if parsed.scheme == 'https':
        scheme_ok = not host_use_http(parsed.host)
    elif parsed.scheme == 'http':
        scheme_ok = host_use_http(parsed.host)
    else:
        scheme_ok = False
    if not scheme_ok:
        return False, 'the protocol is neither https nor http with an .onion/.i2p TLD'

    if parsed.query is not None:
        return False, 'no query in the URL'
    if parsed.fragment is not None:
        return False, 'no fragment in the URL'
    return True, None
def _validate_info(self, broker_info) -> bool:
    """Parse ``broker_info`` and store the broker host, port and userinfo.

    Args:
        broker_info: broker URI string; only ``mqtt`` and ``ws`` schemes are
            accepted.

    Returns:
        ``True`` when the scheme is accepted and both a host and a port were
        found, ``False`` otherwise.
    """
    self.logger.debug("Validating " + broker_info)
    parseduri = urlparse(broker_info)
    if parseduri.scheme not in ("mqtt", "ws"):
        return False

    self.broker_url = parseduri.host
    self.broker_port = parseduri.port
    self.broker_user = parseduri.userinfo
    self.logger.debug("broker_user {}".format(self.broker_user))
    self.logger.debug("broker_url {}, broker_port: {}".format(self.broker_url, self.broker_port))

    # Both host and port are required to connect.
    return bool(self.broker_url and self.broker_port)
def _parse_sophora_url(url: str) -> Tuple[str, str, Optional[int]]:
    """Split a page URL into ``(sophora_id, node, sophora_page)``.

    Args:
        url: Page URL ending in ``.html`` or ``.amp`` (a trailing ``/`` is
            treated as ``/index.html``).

    Returns:
        The Sophora ID (prefixed with the node for ``index`` pages), the node
        (the path up to the last ``/``), and the page number from a
        ``~_page-N`` suffix, or ``None`` if there is no such suffix.

    Raises:
        SkipPageException: if the path does not match the expected pattern.
            Unexpected formats are additionally logged and reported to Sentry.
    """
    # Special cases
    if url == "https://www1.wdr.de/nachrichten/nrw":
        # TODO: Investigate if there are more like this
        url = "https://www1.wdr.de/nachrichten/index.html"

    # Ensure that overview pages with missing "index.html" suffix
    # get related to the same SophoraID
    if url.endswith("/"):
        logger.debug("Adding index.html suffix")
        url = url + "index.html"

    parsed = urlparse(url)
    # Decode the path once. A parser that reports a missing path as None
    # (as rfc3986 does) would otherwise make unquote() raise TypeError
    # instead of following the regular "unexpected format" path below.
    path = unquote(parsed.path or "")

    match = re.match(r"(.*)/(.*?)(?:~_page-(\d+))?\.(?:html|amp)$", path)

    if match is None:
        # Parsing errors that are known and we want to ignore
        match_expected = (
            re.match(r".*\.(?:jsp|pdf|news)$", path)
            or re.match(r".*/:~:text=.*$", path)
        )

        if match_expected is None:
            logger.error("Unexpected parsing error: {}", url)
            sentry_sdk.capture_message(
                f"Failed parsing URL with unexpected format: {url}",
                level="error",
            )
        else:
            logger.debug("Ignored parsing error: {}", url)

        raise SkipPageException(url)

    node = match.group(1)

    # Cut off any other weird Sophora parameters
    sophora_id = re.sub(r"~.*", "", match.group(2))

    # Index pages share the ID "index"; prefix the node to tell them apart.
    if sophora_id == "index":
        sophora_id = f"{node}/{sophora_id}"

    page_text = match.group(3)
    sophora_page = int(page_text) if page_text is not None else None

    return sophora_id, node, sophora_page
def _get_grpc_channel(self):
    """Open a gRPC channel to the configured (or first advertised) endpoint.

    The ``endpoint`` option wins; otherwise the first endpoint listed for this
    group in the service metadata is used.  ``http`` yields an insecure
    channel and ``https`` a TLS channel.

    Raises:
        ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
    """
    endpoint = self.options.get("endpoint", None)
    if endpoint is None:
        group_name = self.group["group_name"]
        endpoint = self.service_metadata.get_all_endpoints_for_group(group_name)[0]

    parsed = urlparse(endpoint)

    # gRPC targets are "host" or "host:port", without the scheme.
    target = parsed.hostname
    if parsed.port is not None:
        target = target + ":" + str(parsed.port)

    if parsed.scheme == "https":
        return grpc.secure_channel(target, grpc.ssl_channel_credentials())
    if parsed.scheme == "http":
        return grpc.insecure_channel(target)
    raise ValueError('Unsupported scheme in service metadata ("{}")'.format(parsed.scheme))
def _validate_info(self, broker_info) -> bool:
    """Parse ``broker_info`` into ``self.url`` and check the required settings.

    Args:
        broker_info: URI string; only ``http`` and ``https`` are accepted and
            a host is required.

    Returns:
        ``True`` when the URI is acceptable and ``self.bucket``, ``self.org``,
        ``self.token`` and ``self.measurement`` are all set; ``False``
        otherwise.
    """
    self.logger.debug("Validating " + broker_info)
    parsed_uri = urlparse(broker_info)
    if parsed_uri.scheme not in ("http", "https"):
        return False
    if not parsed_uri.host:
        return False

    # Rebuild "scheme://host[:port]", dropping path, query and fragment.
    authority = parsed_uri.host
    if parsed_uri.port:
        authority = "{}:{}".format(parsed_uri.host, parsed_uri.port)
    self.url = "{}://{}".format(parsed_uri.scheme, authority)

    if not (self.url and self.bucket and self.org and self.token and self.measurement):
        return False

    # NOTE(review): this writes the token to the debug log - confirm that is
    # acceptable for the deployments this runs in.
    self.logger.debug(
        "url {}, org: {}, bucket: {}, measurement: {}, token: {}".format(self.url, self.org, self.bucket,
                                                                         self.measurement, self.token))
    return True
def _get_base_grpc_channel(self, endpoint):
    """Open a gRPC channel to ``endpoint``.

    ``http`` endpoints get an insecure channel, ``https`` endpoints a TLS
    channel with default credentials.

    Raises:
        ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
    """
    parsed = urlparse(endpoint)

    # The channel target carries no scheme: "host" or "host:port".
    if parsed.port is None:
        target = parsed.hostname
    else:
        target = parsed.hostname + ":" + str(parsed.port)

    scheme = parsed.scheme
    if scheme == "http":
        return grpc.insecure_channel(target)
    if scheme == "https":
        credentials = grpc.ssl_channel_credentials()
        return grpc.secure_channel(target, credentials)

    raise ValueError(
        'Unsupported scheme in service metadata ("{}")'.format(scheme))
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Load the ACME account key and build a client for ``acme_directory_url``.

    Args:
        s3_client: client used to fetch the key for ``s3://bucket/key`` URIs.
        acme_directory_url: passed through to ``acme_client_for_private_key``.
        acme_account_key: ``file://`` or ``s3://`` URI of the PEM private key.

    Raises:
        ValueError: if ``acme_account_key`` uses any other scheme.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == "file":
        # Read raw bytes: load_pem_private_key() expects bytes, and a
        # text-mode read would produce str on Python 3.
        with open(uri.path, "rb") as f:
            key = f.read()
    elif uri.scheme == "s3":
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response["Body"].read()
    else:
        raise ValueError(
            "Invalid acme account key: {!r}".format(acme_account_key))

    key = serialization.load_pem_private_key(key,
                                             password=None,
                                             backend=default_backend())
    return acme_client_for_private_key(acme_directory_url, key)
def url_path(url_path):
    """Raise ``InvalidURLPath`` unless ``url_path`` is a bare absolute path.

    Returns ``True`` when nothing but a path starting with ``/`` is present.
    Errors raised while normalizing or parsing are also converted to
    ``InvalidURLPath``.
    """
    try:
        parts = rfc3986.urlparse(rfc3986.normalize_uri(url_path))
        # Anything other than a path disqualifies the value.
        if parts.scheme or parts.userinfo or parts.host or parts.port:
            raise exceptions.InvalidURLPath(url_path=url_path)
        if parts.path is None or not parts.path.startswith('/'):
            raise exceptions.InvalidURLPath(url_path=url_path)
    except Exception:
        raise exceptions.InvalidURLPath(url_path=url_path)
    return True
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Load the ACME account key and build a client for ``acme_directory_url``.

    Args:
        s3_client: client used to fetch the key for ``s3://bucket/key`` URIs.
        acme_directory_url: passed through to ``acme_client_for_private_key``.
        acme_account_key: ``file://`` or ``s3://`` URI of the PEM private key.

    Raises:
        ValueError: if ``acme_account_key`` uses any other scheme.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == "file":
        # Binary mode so the PEM loader receives bytes on Python 3 as well.
        with open(uri.path, "rb") as f:
            key = f.read()
    elif uri.scheme == "s3":
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response["Body"].read()
    else:
        raise ValueError(
            "Invalid acme account key: {!r}".format(acme_account_key)
        )

    key = serialization.load_pem_private_key(
        key, password=None, backend=default_backend()
    )
    return acme_client_for_private_key(acme_directory_url, key)
def _get_grpc_channel(self):
    """Open a gRPC channel to the ``endpoint`` option or the metadata endpoint.

    Prints the target before connecting.  ``http`` yields an insecure channel
    and ``https`` a TLS channel.

    Raises:
        ValueError: if the endpoint scheme is neither ``http`` nor ``https``.
    """
    endpoint = self.options.get("endpoint", None)
    if endpoint is None:
        endpoint = self.metadata["endpoint"]

    parsed = urlparse(endpoint)
    if parsed.port is None:
        target = parsed.hostname
    else:
        target = parsed.hostname + ":" + str(parsed.port)

    print("Opening grpc to " + target)

    if parsed.scheme == "http":
        return grpc.insecure_channel(target)
    if parsed.scheme == "https":
        return grpc.secure_channel(target, grpc.ssl_channel_credentials())
    raise ValueError(
        'Unsupported scheme in service metadata ("{}")'.format(parsed.scheme))
def __init__(self, config, metadata_provider=None):
    """Create the Ethereum, MPE, IPFS, Registry and account helpers from ``config``.

    Args:
        config: mapping-like object queried with ``.get`` for endpoints and
            optional contract addresses.
        metadata_provider: stored as-is on the instance.
    """
    self._config = config
    self._metadata_provider = metadata_provider

    # Ethereum client
    eth_rpc_endpoint = self._config.get(
        "eth_rpc_endpoint",
        "https://mainnet.infura.io/v3/e7732e1f679e461b9bb4da5653ac3fc2")
    self.web3 = web3.Web3(web3.HTTPProvider(eth_rpc_endpoint))
    self.web3.eth.setGasPriceStrategy(medium_gas_price_strategy)

    # MPE contract; an explicit address is mostly useful for local testing.
    mpe_address = self._config.get("mpe_contract_address", None)
    if mpe_address is not None:
        self.mpe_contract = MPEContract(self.web3, mpe_address)
    else:
        self.mpe_contract = MPEContract(self.web3)

    # IPFS client, defaulting to http and port 5001 when not in the endpoint.
    ipfs = urlparse(self._config.get(
        "ipfs_rpc_endpoint", "https://ipfs.singularitynet.io:80"))
    ipfs_scheme = ipfs.scheme or "http"
    ipfs_port = ipfs.port or 5001
    self.ipfs_client = ipfsapi.connect(
        urljoin(ipfs_scheme, ipfs.hostname), ipfs_port)

    # Registry contract; an explicit address is mostly useful for local testing.
    registry_address = self._config.get("registry_contract_address", None)
    if registry_address is not None:
        self.registry_contract = get_contract_object(
            self.web3, "Registry.json", registry_address)
    else:
        self.registry_contract = get_contract_object(
            self.web3, "Registry.json")

    self.account = Account(self.web3, config, self.mpe_contract)
def setup_acme_client(s3_client, acme_directory_url, acme_account_key):
    """Load the ACME account key and build a client for ``acme_directory_url``.

    Args:
        s3_client: client used to fetch the key for ``s3://bucket/key`` URIs.
        acme_directory_url: passed through to ``acme_client_for_private_key``.
        acme_account_key: ``file://`` URI, scheme-less path, or ``s3://`` URI
            of the PEM private key.

    Raises:
        ValueError: if ``acme_account_key`` uses any other scheme.
    """
    uri = rfc3986.urlparse(acme_account_key)
    if uri.scheme == 'file' or uri.scheme is None:
        if uri.host is None:
            path = uri.path
        elif uri.path is None:
            path = uri.host
        else:
            # NOTE(review): os.path.join discards uri.host when uri.path is
            # absolute (starts with "/") - confirm that is the intent.
            path = os.path.join(uri.host, uri.path)
        # Binary mode: the PEM loader below needs bytes.
        with open(path, 'rb') as f:
            key = f.read()
    elif uri.scheme == 's3':
        # uri.path includes a leading "/"
        response = s3_client.get_object(Bucket=uri.host, Key=uri.path[1:])
        key = response['Body'].read()
    else:
        raise ValueError(
            'Invalid acme account key: {!r}'.format(acme_account_key))

    # Both branches normally yield bytes already; encode only if a source
    # handed back text, instead of calling .encode() on bytes.
    if not isinstance(key, bytes):
        key = key.encode("utf-8")

    key = serialization.load_pem_private_key(key,
                                             password=None,
                                             backend=default_backend())
    return acme_client_for_private_key(acme_directory_url, key)
def _parse_row(element):
    """Turn a ``(url, title)`` row into ``(canonical_url, headline, query)``.

    Returns ``None`` when the title is ``"-"``, when the URL has no host or
    path, or when it is not a ``wdr.de`` URL under ``/nachrichten``.
    """
    if element[1] == "-":
        return None

    parsed = urlparse(element[0])

    # Apparently sometimes there's no host
    if parsed.host is None or parsed.path is None:
        return None

    # Only keep URLs that belong to the property.
    if not (parsed.host.endswith("wdr.de")
            and parsed.path.startswith("/nachrichten")):
        return None

    # Canonical URL (without query and fragment) plus the original query.
    query = parsed.query
    canonical_url = parsed.copy_with(query=None, fragment=None).unsplit()

    # The headline is the part after the last "_", unescaped and tag-free.
    last_segment = element[1].split("_")[-1]
    headline = re.sub(r"<.*?>", "", html.unescape(last_segment))

    return canonical_url, headline, query
def __init__(self, config):
    """Create the Ethereum, MPE, IPFS, Registry and account helpers from ``config``.

    Args:
        config: mapping-like object queried with ``.get`` for the endpoints.
    """
    self._config = config

    # Ethereum client
    eth_rpc_endpoint = self._config.get("eth_rpc_endpoint", "https://mainnet.infura.io")
    self.web3 = web3.Web3(web3.HTTPProvider(eth_rpc_endpoint))
    self.web3.eth.setGasPriceStrategy(medium_gas_price_strategy)

    self.mpe_contract = MPEContract(self.web3)

    # IPFS client, defaulting to http and port 5001 when not in the endpoint.
    ipfs = urlparse(self._config.get(
        "ipfs_rpc_endpoint", "https://ipfs.singularitynet.io:80"))
    ipfs_scheme = ipfs.scheme or "http"
    ipfs_port = ipfs.port or 5001
    self.ipfs_client = ipfsapi.connect(
        urljoin(ipfs_scheme, ipfs.hostname), ipfs_port)

    self.registry_contract = get_contract_object(self.web3, "Registry.json")
    self.account = Account(self.web3, config, self.mpe_contract)
def url(self):
    """Return the underlying Starlette request URL parsed with rfc3986."""
    raw_url = str(self._starlette.url)
    return rfc3986.urlparse(raw_url)
def test_urlparse_a_unicode_hostname():
    """Parsing the SNOWMAN_HOST bytestring must yield the decoded host.

    The expected host is the decoded URL minus its first seven characters
    (presumably the ``http://`` prefix; SNOWMAN_HOST is defined elsewhere).
    """
    expected_host = SNOWMAN_HOST.decode('utf-8')[7:]
    result = urlparse(SNOWMAN_HOST)
    assert result.host == expected_host
def test_urlparse_a_unicode_hostname_with_auth():
    """Userinfo must be extracted when it precedes the SNOWMAN host."""
    raw_url = b'http://userinfo@' + SNOWMAN + b'.com'
    result = urlparse(raw_url)
    assert result.userinfo == 'userinfo'
def prepare_url(self, url, params, validate=False):
    """Prepares the given HTTP URL.

    Args:
        url: URL as ``str``, ``bytes`` (decoded as UTF-8) or any object with a
            string representation.
        params: query parameters to append to the URL's query string.
        validate: when true, additionally run ``rfc3986.normalize_uri`` on the
            raw URL so that its errors surface as ``InvalidURL``.

    Raises:
        InvalidURL: if the URL cannot be parsed, has no host, or has an
            invalid host label.
        MissingScheme: if the URL has no scheme.

    The result is stored on ``self.url``; nothing is returned.
    """
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/requests/requests/pull/2238
    if isinstance(url, bytes):
        url = url.decode("utf8")
    else:
        url = str(url)

    # Ignore any leading and trailing whitespace characters.
    url = url.strip()

    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `url_parse`, which
    # handles RFC 3986 only.
    if ":" in url and not url.lower().startswith("http"):
        self.url = url
        return

    # Support for unicode domain names and paths.
    try:
        uri = rfc3986.urlparse(url)
        if validate:
            rfc3986.normalize_uri(url)
    except rfc3986.exceptions.RFC3986Exception as exc:
        raise InvalidURL(f"Invalid URL {url!r}: URL is improper.") from exc

    if not uri.scheme:
        error = (
            "Invalid URL {0!r}: No scheme supplied. Perhaps you meant http://{0}?"
        )
        error = error.format(to_native_string(url, "utf8"))
        raise MissingScheme(error)

    if not uri.host:
        raise InvalidURL(f"Invalid URL {url!r}: No host supplied")

    # In general, we want to try IDNA encoding the hostname if the string contains
    # non-ASCII characters. This allows users to automatically get the correct IDNA
    # behaviour. For strings containing only ASCII characters, we need to also verify
    # it doesn't start with a wildcard (*), before allowing the unencoded hostname.
    if not unicode_is_ascii(uri.host):
        try:
            uri = uri.copy_with(host=self._get_idna_encoded_host(uri.host))
        except UnicodeError as exc:
            raise InvalidURL("URL has an invalid label.") from exc
    elif uri.host.startswith("*"):
        raise InvalidURL("URL has an invalid label.")

    # Bare domains aren't valid URLs.
    if not uri.path:
        uri = uri.copy_with(path="/")

    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    # Append the encoded params to any query string already on the URL.
    enc_params = self._encode_params(params)
    if enc_params:
        if uri.query:
            uri = uri.copy_with(query=f"{uri.query}&{enc_params}")
        else:
            uri = uri.copy_with(query=enc_params)

    # Normalize the URI.
    self.url = rfc3986.normalize_uri(uri.unsplit())
def url(self):
    """The parsed URL of the Request."""
    parsed_url = rfc3986.urlparse(self.full_url)
    return parsed_url
def test_urlparse_an_invalid_authority_parses_port():
    """Userinfo containing an extra ``@`` must still yield port, userinfo and host."""
    # The credentials in the URL have to match the userinfo asserted below;
    # a masked placeholder here would make the assertion unsatisfiable.
    url = 'http://foo:b@r@[::1]:80/get'
    parsed = urlparse(url)
    assert parsed.port == 80
    assert parsed.userinfo == 'foo:b@r'
    assert parsed.hostname == '[::1]'
def test_unsplit_idna_a_unicode_hostname():
    """``unsplit(use_idna=True)`` on SNOWMAN_HOST must give SNOWMAN_IDNA_HOST."""
    rebuilt = urlparse(SNOWMAN_HOST).unsplit(use_idna=True)
    assert rebuilt == SNOWMAN_IDNA_HOST
def _get_ipfs_client(self):
    """Return an ``ipfsapi`` connection to the endpoint given by ``self.config``.

    Falls back to scheme ``http`` and port ``5001`` when the endpoint string
    does not specify them.
    """
    endpoint = urlparse(self.config.get_ipfs_endpoint())
    scheme = endpoint.scheme or "http"
    port = endpoint.port or 5001
    host_url = urljoin(scheme, endpoint.hostname)
    return ipfsapi.connect(host_url, port)
def test_port_parsing(port):
    """Parsing a URL built with ``port`` must raise ``InvalidPort``."""
    candidate = 'https://httpbin.org:{0}/get'.format(port)
    with pytest.raises(exceptions.InvalidPort):
        rfc3986.urlparse(candidate)
def uri_validator(uri):
    """Return ``True`` if ``uri`` parses with both a scheme and a network location.

    Any ordinary error raised while parsing or reading the result (for
    example from a malformed or non-string input) is treated as "not valid"
    and yields ``False``.
    """
    try:
        result = urlparse(uri)
        return all([result.scheme, result.netloc])
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
def do_parse(uri):
    """Return ``uri`` parsed by ``rfc3986.urlparse``."""
    parsed = rfc3986.urlparse(uri)
    return parsed