def get_authorize_url(self, client_id, audience=None, state=None, redirect_uri=None,
                      response_type='code', scope='openid', quote_via=quote_plus):
    """Build the ``https://<domain>/authorize`` URL for the given client.

    Args:
        client_id: value of the ``client_id`` query parameter.
        audience, state, redirect_uri: optional query parameters; any that
            are left as ``None`` are omitted from the query string instead
            of being sent as the literal text ``None``.
        response_type: value of ``response_type`` (default ``'code'``).
        scope: value of ``scope`` (default ``'openid'``).
        quote_via: quoting function applied to keys and values. Pass
            ``quote`` to encode spaces as ``%20``; the default
            (``quote_plus``) encodes them as ``+``.

    Returns:
        The authorize URL as a string.
    """
    import sys

    candidates = (
        ('client_id', client_id),
        ('audience', audience),
        ('response_type', response_type),
        ('scope', scope),
        ('state', state),
        ('redirect_uri', redirect_uri),
    )
    params = [(key, value) for key, value in candidates if value is not None]

    # urlencode() only accepts ``quote_via`` from Python 3.5 on. Compare the
    # real version tuple: a string comparison sorts "310" before "34".
    if sys.version_info >= (3, 5):
        query = urlencode(params, doseq=True, quote_via=quote_via)
    else:
        query = '&'.join(
            '{}={}'.format(quote_via(key, safe=''), quote_via(value, safe=''))
            for key, value in params)

    return urlunparse(['https', self.domain, '/authorize', None, query, None])
def get(self, url, headers=None, allow_redirects=True):
    """Issue a GET for *url* against the host that ``self.head`` resolves.

    The domain of *url* is probed with ``self.head`` first. If that returns
    ``None`` or a status other than 200, ``None`` is returned. Otherwise the
    scheme and netloc of the resolved URL are combined with the path,
    params, query and fragment of the original *url*, and the request is
    dispatched through ``self._request_core``.

    Args:
        url: the URL to fetch.
        headers: optional header mapping; a fresh dict is used when omitted
            so that no mapping is shared between calls.
        allow_redirects: forwarded to the underlying ``_get`` helper.

    Returns:
        Whatever ``self._request_core`` returns, or ``None`` when the probe
        fails.
    """
    if headers is None:
        headers = {}

    parsed_url = urlparse(url)

    response = self.head(_get_domain(url))
    if response is None:
        return None

    (url, status_code) = response
    if status_code != 200:
        return None

    # Keep the resolved scheme/host but the originally requested resource.
    resolved_url = urlparse(url)
    url = urlunparse((
        resolved_url.scheme,
        resolved_url.netloc,
        parsed_url.path,
        parsed_url.params,
        parsed_url.query,
        parsed_url.fragment,
    ))

    tools.log('GET: %s' % url, 'info')
    request = lambda x: _get(self._cfscrape, url, headers, self._timeout, allow_redirects, x)
    # The final URL is attached to the callable for _request_core's use.
    request.url = url

    return self._request_core(request)
def get_robots_txt_url(self, url):
    """Return the ``robots.txt`` URL for the site that serves *url*.

    The scheme, query and fragment of *url* are carried over; the path is
    replaced by ``robots.txt`` and the result is passed through
    ``requote_uri``.
    """
    comps = parse_url(url)
    # Use netloc ("host[:port]") rather than the bare host so that a site on
    # a non-default port is not redirected to the default one.
    return requote_uri(
        urlunparse([
            comps.scheme, comps.netloc, 'robots.txt', None, comps.query,
            comps.fragment
        ]))
def get_url(self, path='/', websocket=False, remote=True, attach_api_key=True,
            userId=None, pass_uid=False, **query):
    '''Construct a url for an emby request.

    Parameters
    ----------
    path : str
        uri path (excluding domain and port) of the request; may contain
        the placeholders ``{UserId}``, ``{ApiKey}`` and ``{DeviceId}``
    websocket : bool, optional
        if true, ``ws(s)`` is used instead of ``http(s)``
    remote : bool, optional
        if true, the remote address is preferred when set (default True)
    attach_api_key : bool, optional
        if true and an api key is set, ``api_key`` and ``deviceId`` are
        added to the query (default True)
    userId : str, optional
        uid to use; if none, the instance default is used
    pass_uid : bool, optional
        if true, the uid is added to the query (default False)
    query : keyword arguments
        additional parameters to set (part of url after the ``?``)

    See Also
    --------
    get :
    getJson :
    post :
    delete :

    Returns
    -------
    full url
    '''
    uid = userId or self.userid

    if attach_api_key and self.api_key:
        query['api_key'] = self.api_key
        query['deviceId'] = self.device_id
    if pass_uid:
        query['userId'] = uid

    base = (self.urlremote or self.url) if remote else self.url
    scheme = {'http': 'ws', 'https': 'wss'}[base.scheme] if websocket else base.scheme

    # The query slot holds a placeholder so that one str.format() call can
    # fill both the path placeholders and the encoded query string.
    template = urlunparse((scheme, base.netloc + '/emby', path, '', '{params}', ''))
    full = template.format(
        UserId=uid,
        ApiKey=self.api_key,
        DeviceId=self.device_id,
        params=urlencode(query),
    )
    # An empty query leaves a dangling '?' behind; drop it.
    return full[:-1] if full.endswith('?') else full
def process_url(url):
    """Reduce *url* to its scheme, network location and path.

    Params, query and fragment are always discarded and a missing scheme
    defaults to ``http``. When the parser finds no network location but
    does find a path, the first path segment is used as the network
    location and the rest of the path is dropped.
    """
    parts = urlparse(url)
    scheme = parts.scheme or "http"
    netloc, path = parts.netloc, parts.path
    if not netloc and path:
        # Scheme-less input: "example.com/foo" parses entirely as a path.
        netloc, path = path.split("/")[0], ""
    return urlunparse((scheme, netloc, path, "", "", ""))
def solve_cf_challenge(self, resp, **original_kwargs):
    """Answer a Cloudflare IUAM challenge page and replay the request.

    Args:
        resp: the response that carried the challenge page.
        **original_kwargs: keyword arguments of the original request; they
            are reused unchanged for the final request.

    Returns:
        The response of the request issued after the challenge redirect.

    Raises:
        ValueError: if the challenge form fields cannot be extracted.
    """
    start_time = time.time()

    body = resp.text
    parsed_url = urlparse(resp.url)
    domain = parsed_url.netloc
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)

    cloudflare_kwargs = copy.deepcopy(original_kwargs)
    headers = cloudflare_kwargs.setdefault("headers", {})
    headers["Referer"] = resp.url

    try:
        params = cloudflare_kwargs["params"] = OrderedDict(
            re.findall(r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body)
        )

        for k in ("jschl_vc", "pass"):
            if k not in params:
                raise ValueError("%s is missing from challenge form" % k)
    except Exception as e:
        # Something is wrong with the page.
        # This may indicate Cloudflare has changed their anti-bot
        # technique. If you see this and are running the latest version,
        # please open a GitHub issue so I can update the code accordingly.
        # Format the exception itself: ``e.message`` does not exist on
        # Python 3 and would turn this report into an AttributeError.
        raise ValueError(
            "Unable to parse Cloudflare anti-bot IUAM page: %s %s"
            % (e, BUG_REPORT)
        )

    # Solve the Javascript challenge
    answer, delay = self.solve_challenge(body, domain)
    params["jschl_answer"] = answer

    # Requests transforms any request into a GET after a redirect,
    # so the redirect has to be handled manually here to allow for
    # performing other types of requests even as the first request.
    method = resp.request.method
    cloudflare_kwargs["allow_redirects"] = False

    # Cloudflare requires a delay before solving the challenge
    time.sleep(max(delay - (time.time() - start_time), 0))

    # Send the challenge response and handle the redirect manually
    redirect = self.request(method, submit_url, **cloudflare_kwargs)
    redirect_location = urlparse(redirect.headers["Location"])

    if not redirect_location.netloc:
        # Relative redirect: rebuild an absolute URL on the original host.
        redirect_url = urlunparse(
            (
                parsed_url.scheme,
                domain,
                redirect_location.path,
                redirect_location.params,
                redirect_location.query,
                redirect_location.fragment,
            )
        )
        return self.request(method, redirect_url, **original_kwargs)
    return self.request(method, redirect.headers["Location"], **original_kwargs)
def public_url(self):
    """
    Return the public url, often the same as the private url, but
    it can be replaced by a url:// wrapper which caches passwords
    onto the disk.

    When the url carries a password, the real netloc and scheme are
    written as JSON to the file named by ``self._cache_file(digest)`` and
    the returned url uses scheme ``url`` with the md5 digest of the netloc
    in place of the netloc.
    """
    url = self.url
    if url.password:
        netloc = url.netloc
        # hashlib only accepts bytes; the parsed netloc is text on Python 3.
        raw = netloc if isinstance(netloc, bytes) else netloc.encode('utf-8')
        digest = md5(raw).hexdigest()
        with open(self._cache_file(digest), 'w') as fhl:
            fhl.write(json.dumps({'netloc': url.netloc, 'scheme': url.scheme}))
        url = url._replace(netloc=digest, scheme='url')
    return urlunparse(url)
def solve_cf_challenge(self, resp, **original_kwargs):
    """Answer a Cloudflare IUAM challenge page and replay the request.

    Increments ``self.tries``, extracts the challenge form fields from the
    page, computes the answer with ``solve_challenge``, waits
    (``self.delay`` when set, otherwise the remainder of the delay the
    challenge asks for), submits the answer and follows the resulting
    redirect manually with the original request method.

    Args:
        resp: the response that carried the challenge page.
        **original_kwargs: keyword arguments of the original request; they
            are reused unchanged for the final request.

    Returns:
        The response of the request issued after the challenge redirect.

    Raises:
        ValueError: if the challenge form fields cannot be extracted.
    """
    self.tries += 1
    start_time = time.time()

    body = resp.text
    parsed_url = urlparse(resp.url)
    domain = parsed_url.netloc
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme, domain)

    cloudflare_kwargs = copy.deepcopy(original_kwargs)
    headers = cloudflare_kwargs.setdefault("headers", {})
    headers["Referer"] = resp.url

    try:
        params = cloudflare_kwargs["params"] = OrderedDict(
            re.findall(
                r'name="(s|jschl_vc|pass)"(?: [^<>]*)? value="(.+?)"', body))

        for k in ("jschl_vc", "pass"):
            if k not in params:
                raise ValueError("%s is missing from challenge form" % k)
    except Exception as e:
        # Format the exception itself: ``e.message`` does not exist on
        # Python 3 and would turn this report into an AttributeError.
        raise ValueError(
            "Unable to parse Cloudflare anti-bot IUAM page: %s %s" %
            (e, BUG_REPORT))

    try:
        answer, delay = solve_challenge(body, domain)
    except Exception:
        self.raise_captcha_error()
        # Only reached if raise_captcha_error() returns: surface the real
        # failure rather than an UnboundLocalError on ``answer`` below.
        raise

    params["jschl_answer"] = answer

    # The redirect is followed by hand so the original method is kept.
    method = resp.request.method
    cloudflare_kwargs["allow_redirects"] = False

    if not self.delay:
        time.sleep(max(delay - (time.time() - start_time), 0))
    else:
        time.sleep(self.delay)

    redirect = self.request(method, submit_url, **cloudflare_kwargs)
    redirect_location = urlparse(redirect.headers["Location"])

    if not redirect_location.netloc:
        # Relative redirect: rebuild an absolute URL on the original host.
        redirect_url = urlunparse((
            parsed_url.scheme,
            domain,
            redirect_location.path,
            redirect_location.params,
            redirect_location.query,
            redirect_location.fragment,
        ))
        return self.request(method, redirect_url, **original_kwargs)
    return self.request(method, redirect.headers["Location"], **original_kwargs)
def sanitise_url(self, url: str, base_url='') -> str:
    """Cleans a given input URL and returns a RFC compliant URL as a string.

    Args:
        url: the URL to clean; bytes are decoded as UTF-8, anything else is
            converted with ``str()``.
        base_url: when given, *url* is first resolved against it.

    Returns:
        The cleaned URL without fragment, or ``None`` when the URL cannot
        be parsed or has no scheme or host.
    """
    if isinstance(url, bytes):
        url = url.decode('utf8')
    else:
        url = str(url)

    # Remove leading whitespaces from url
    url = url.lstrip()

    if base_url:
        url = urljoin(base_url, url)

    try:
        scheme, auth, host, port, path, query, fragment = parse_url(url)
    except Exception:
        return None

    # Without a scheme or host there is nothing to rebuild; report this the
    # same way as an unparseable URL instead of failing with a TypeError.
    if not scheme or not host:
        return None

    # Carefully reconstruct the network location
    netloc = auth or ''
    if netloc:
        netloc += '@'
    netloc += host
    if port:
        # Only report on ports if they are used in a non-standard way
        if scheme == 'http' and port == 80:
            pass
        elif scheme == 'https' and port == 443:
            pass
        else:
            netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    if not path:
        path = '/'

    url = requote_uri(
        urlunparse([scheme, netloc, path, None, query, fragment]))

    # Search engines ignore hash fragments hence we remove them from URLs
    url = url.split('#')[0]

    return url
def get_session(self, previousresponse, **kwargs):
    """Perform a session login and return a new session ID.

    A copy of the request behind *previousresponse* is stripped of its
    ``Authorization`` header, turned into a POST to ``APIV1 + "/sessions"``
    on the same scheme and host with the credentials as JSON body, and sent
    over the same connection.

    Args:
        previousresponse: response whose request and connection are reused.
        **kwargs: forwarded to ``connection.send`` (contains the SSL verify
            setting).

    Returns:
        The ``sessionId`` value from the JSON reply.

    Raises:
        Unauthorized: if credentials are missing or no session id is
            returned.
        NotBootstrapped: if the server answers 503 with an error message
            containing "should be bootstrapped".
    """
    if self.username is None or self.password is None:
        raise Unauthorized("Cannot authenticate without username/password")
    logger.info("Attempting to authenticate as {0}".format(self.username))
    authdict = {
        "username": self.username,
        "password": self.password,
        "provider": self.provider
    }

    prep = previousresponse.request.copy()
    # Drop any stale credentials in one step, whether present or not.
    prep.headers.pop('Authorization', None)
    prep.prepare_method("post")
    p = urlparse(previousresponse.request.url)
    prep.prepare_url(urlunparse(
        [p.scheme, p.netloc, APIV1 + "/sessions", None, None, None]),
        params=None)
    logger.debug("Authenticating via url: {0}".format(prep.url))
    prep.prepare_body(data=None, files=None, json=authdict)
    authresponse = previousresponse.connection.send(
        prep, **kwargs)  # kwargs contains ssl _verify

    try:
        return authresponse.json()['sessionId']
    except Exception:
        error_message = ''
        if authresponse.status_code == 503:
            # The body may not be JSON (or not a mapping) at all; in that
            # case fall through to the generic Unauthorized error.
            try:
                error_message = authresponse.json().get('errorMessage', '') or ''
            except Exception:
                error_message = ''
        if 'should be bootstrapped' in error_message:
            raise NotBootstrapped(error_message, authresponse)
        raise Unauthorized("Authentication failed", authresponse)
def request(self, *args, **kwargs):
    """Send a request through the parent class with instance defaults.

    ``args[0]`` is the HTTP method and ``args[1]`` the URL. Instance
    defaults are applied for ``proxies``, ``allow_redirects``, ``timeout``
    and ``verify`` when the caller did not pass them. Instance headers are
    merged with caller headers, and the ``Host`` header always names the
    original host. The connection target can be overridden by the
    ``dest_ip_addr`` keyword, ``self.dest_ip_addr`` or the DNS cache.

    The request is attempted up to ``self.max_retries`` times; timeouts and
    connection errors (when no proxy handler is configured) are retried.

    Returns:
        The parent's response for the first successful attempt, or ``None``
        when no attempt was made.

    Raises:
        RequestException: on too many redirects, or on a connection error
            while a proxy handler is configured.
        Exception: the last retried error once all attempts are used up.
    """
    method = args[0]
    protocol, host, port, path, query = self._normalize_url(args[1])

    if 'proxies' not in kwargs:
        kwargs['proxies'] = self.proxies
    if 'allow_redirects' not in kwargs:
        kwargs['allow_redirects'] = self.allow_redirects
    if 'timeout' not in kwargs:
        kwargs['timeout'] = self.timeout
    if 'verify' not in kwargs:
        kwargs['verify'] = self.verify

    headers = copy.copy(self.headers)
    if 'headers' in kwargs:
        headers.update(kwargs['headers'])
    # NOTE(review): 'user_agent' and 'dest_ip_addr' are left in kwargs and
    # forwarded to the parent's request() - confirm the parent accepts them.
    if 'user_agent' in kwargs:
        headers.update({'User-Agent': kwargs['user_agent']})

    dst_host = host
    host_header = host
    if 'dest_ip_addr' in kwargs:
        dst_host = kwargs['dest_ip_addr']
    elif self.dest_ip_addr:
        dst_host = self.dest_ip_addr
    elif self.use_dns_cache:
        dst_host = self.get_host_cache(host)

    # Only spell out the port when it is not the scheme's default.
    if (protocol == 'http' and port != 80) or (protocol == 'https' and port != 443):
        dst_host += ':{0}'.format(port)
        host_header += ':{0}'.format(port)

    headers.update({'Host': host_header})
    url = urlunparse([protocol, dst_host, path, None, query, None])
    kwargs['headers'] = headers

    attempt = 0
    last_exception = None
    while attempt < self.max_retries:
        attempt += 1
        try:
            result = super().request(method, url, **kwargs)
        except TooManyRedirects as e:
            raise RequestException('Too many redirects: {0}'.format(e))
        except ConnectionError as e:
            if self.proxy_handler is not None:
                raise RequestException('Error with the proxy: {0}'.format(e))
            # Remember the failure so exhausting the retries re-raises it.
            last_exception = e
        except (ConnectTimeout, ReadTimeout, Timeout, IncompleteRead,
                socket.timeout) as ex:
            last_exception = ex
        else:
            # A success ends the loop at once, even on the final attempt.
            time.sleep(self.delay)
            return result

    if last_exception is not None:
        raise last_exception
    return None
def _key_from_url(url): parsed = urlparse(url) return urlunparse((parsed.scheme.lower(), parsed.netloc.lower(), '', '', '', ''))
def __str__(self): """Return the private url, what we keep internally""" return urlunparse(self.url)
def file(self, filename):
    """Gets a managed url for a sub-file portion.

    *filename* is joined onto the path of ``self.url`` and the result is
    wrapped in a new ``ManagedUrl``.
    """
    import posixpath

    # URL paths always use '/', so join with posixpath: os.path.join would
    # insert backslashes on Windows.
    path = posixpath.join(self.url.path, filename)
    return ManagedUrl(urlunparse(self.url._replace(path=path)))
def solve_cf_challenge(self, resp, **original_kwargs):
    """Answer a Cloudflare IUAM challenge form and replay the request.

    The challenge ``<form>`` is located in the page, its ``method`` and
    ``action`` are read, and every ``<input>`` except ``jschl_answer`` is
    collected: into ``data`` for a POST form, into ``params`` for a GET
    form. Query parameters embedded in the action URL always go to
    ``params``. The computed answer is added, the form is submitted after
    the required delay, and the reply is handled by hand (redirect,
    clearance cookie, or plain retry).

    Args:
        resp: the response that carried the challenge page.
        **original_kwargs: keyword arguments of the original request.

    Returns:
        The response of the follow-up request.

    Raises:
        ValueError: if the form fields cannot be extracted.
    """
    start_time = time.time()

    body = resp.text
    parsed_url = urlparse(resp.url)
    domain = parsed_url.netloc
    challenge_form = re.search(
        r'\<form.*?id=\"challenge-form\".*?\/form\>', body,
        flags=re.S).group(0)  # find challenge form
    method = re.search(r'method=\"(.*?)\"', challenge_form,
                       flags=re.S).group(1)
    if self.org_method is None:
        self.org_method = resp.request.method

    # Parse the action attribute once: [path, query, ...].
    action = re.search(r'action=\"(.*?)\"', challenge_form,
                       flags=re.S).group(1)
    action_parts = action.split('?')
    submit_url = "%s://%s%s" % (parsed_url.scheme, domain, action_parts[0])

    cloudflare_kwargs = copy.deepcopy(original_kwargs)

    headers = cloudflare_kwargs.setdefault("headers", {})
    headers["Referer"] = resp.url

    try:
        cloudflare_kwargs["params"] = dict()
        cloudflare_kwargs["data"] = dict()
        if len(action_parts) != 1:
            for param in action_parts[1].split('&'):
                # partition keeps '=' characters inside the value and
                # tolerates a key without a value.
                key, _, value = param.partition('=')
                cloudflare_kwargs["params"][key] = value

        for input_ in re.findall(r'[^-] \<input.*?(?:\/>|\<\/input\>)',
                                 challenge_form, flags=re.S):
            name = re.search(r'name=\"(.*?)\"', input_, flags=re.S).group(1)
            if name == 'jschl_answer':
                continue
            if method == 'POST':
                cloudflare_kwargs["data"][name] = re.search(
                    r'value=\"(.*?)\"', input_, flags=re.S).group(1)
            elif method == 'GET':
                cloudflare_kwargs["params"][name] = re.search(
                    r'value=\"(.*?)\"', input_, flags=re.S).group(1)

        if method == 'POST':
            for k in ("jschl_vc", "pass"):
                if k not in cloudflare_kwargs["data"]:
                    raise ValueError("%s is missing from challenge form" % k)
        elif method == 'GET':
            for k in ("jschl_vc", "pass"):
                if k not in cloudflare_kwargs["params"]:
                    raise ValueError("%s is missing from challenge form" % k)

    except Exception as e:
        # Something is wrong with the page.
        # This may indicate Cloudflare has changed their anti-bot
        # technique. If you see this and are running the latest version,
        # please open a GitHub issue so I can update the code accordingly.
        raise ValueError(
            "Unable to parse Cloudflare anti-bot IUAM page: %s %s" %
            (e, BUG_REPORT))

    # Solve the Javascript challenge
    answer, delay = self.solve_challenge(body, domain)
    if method == 'POST':
        cloudflare_kwargs["data"]["jschl_answer"] = answer
    elif method == 'GET':
        cloudflare_kwargs["params"]["jschl_answer"] = answer

    # Requests transforms any request into a GET after a redirect,
    # so the redirect has to be handled manually here to allow for
    # performing other types of requests even as the first request.
    cloudflare_kwargs["allow_redirects"] = False

    # Cloudflare requires a delay before solving the challenge
    time.sleep(max(delay - (time.time() - start_time), 0))

    # Send the challenge response and handle the redirect manually
    redirect = self.request(method, submit_url, **cloudflare_kwargs)
    if "Location" in redirect.headers:
        redirect_location = urlparse(redirect.headers["Location"])

        if not redirect_location.netloc:
            # Relative redirect: rebuild an absolute URL on the same host.
            redirect_url = urlunparse((
                parsed_url.scheme,
                domain,
                redirect_location.path,
                redirect_location.params,
                redirect_location.query,
                redirect_location.fragment,
            ))
            return self.request(method, redirect_url, **original_kwargs)
        return self.request(method, redirect.headers["Location"],
                            **original_kwargs)
    elif "Set-Cookie" in redirect.headers:
        if 'cf_clearance' in redirect.headers['Set-Cookie']:
            resp = self.request(self.org_method, submit_url,
                                cookies=redirect.cookies)
            return resp
        else:
            return self.request(method, submit_url, **original_kwargs)
    else:
        resp = self.request(self.org_method, submit_url, **cloudflare_kwargs)
        return resp
def prepare_url(self, url, params):
    """Prepares the given HTTP URL and stores it on ``self.url``.

    A URL that starts with the ``key_unquote`` marker has every occurrence
    of the marker removed and is stored without being passed through
    ``requote_uri``.
    """
    #: Accept objects that have string representations.
    #: We're unable to blindly call unicode/str functions
    #: as this will include the bytestring indicator (b'')
    #: on python 3.x.
    #: https://github.com/requests/requests/pull/2238
    url = url.decode('utf8') if isinstance(url, bytes) else str(url)

    # Remove leading whitespaces from url
    url = url.lstrip()

    skip_quoting = url.startswith(key_unquote)
    if skip_quoting:
        url = url.replace(key_unquote, "")

    # Don't do any URL preparation for non-HTTP schemes like `mailto`,
    # `data` etc to work around exceptions from `url_parse`, which
    # handles RFC 3986 only.
    if ':' in url and not url.lower().startswith('http'):
        self.url = url
        return

    # Support for unicode domain names and paths.
    try:
        scheme, auth, host, port, path, query, fragment = parse_url(url)
    except LocationParseError as e:
        raise InvalidURL(*e.args)

    if not scheme:
        raise MissingSchema(
            "Invalid URL {0!r}: No schema supplied. Perhaps you meant http://{0}?".format(
                to_native_string(url, 'utf8')))

    if not host:
        raise InvalidURL("Invalid URL %r: No host supplied" % url)

    # In general, we want to try IDNA encoding the hostname if the string contains
    # non-ASCII characters. This allows users to automatically get the correct IDNA
    # behaviour. For strings containing only ASCII characters, we need to also verify
    # it doesn't start with a wildcard (*), before allowing the unencoded hostname.
    if not unicode_is_ascii(host):
        try:
            host = self._get_idna_encoded_host(host)
        except UnicodeError:
            raise InvalidURL('URL has an invalid label.')
    elif host.startswith(u'*'):
        raise InvalidURL('URL has an invalid label.')

    # Carefully reconstruct the network location
    netloc = (auth + '@' if auth else '') + host
    if port:
        netloc += ':' + str(port)

    # Bare domains aren't valid URLs.
    path = path or '/'

    if isinstance(params, (str, bytes)):
        params = to_native_string(params)

    # Append the encoded params to any query already present in the URL.
    enc_params = self._encode_params(params)
    if enc_params:
        query = '%s&%s' % (query, enc_params) if query else enc_params

    assembled = urlunparse([scheme, netloc, path, None, query, fragment])
    self.url = assembled if skip_quoting else requote_uri(assembled)
def _prepend_host(self, path): scheme = self._batch_url_parsed.scheme netloc = self._batch_url_parsed.netloc parsed = urlparse(path) return urlunparse((scheme, netloc, parsed.path, parsed.params, parsed.query, parsed.fragment))