def test_parse_url(self):
    """parse_url must decompose a representative set of URLs into Url tuples."""
    cases = {
        # Scheme / host / path combinations
        'http://google.com/mail': Url('http', host='google.com', path='/mail'),
        'http://google.com/mail/': Url('http', host='google.com', path='/mail/'),
        'google.com/mail': Url(host='google.com', path='/mail'),
        'http://google.com/': Url('http', host='google.com', path='/'),
        'http://google.com': Url('http', host='google.com'),
        'http://google.com?foo': Url('http', host='google.com', path='', query='foo'),
        # Path / query / fragment
        '': Url(),
        '/': Url(path='/'),
        '?': Url(path='', query=''),
        '#': Url(path='', fragment=''),
        '#?/!google.com/?foo#bar': Url(path='', fragment='?/!google.com/?foo#bar'),
        '/foo': Url(path='/foo'),
        '/foo?bar=baz': Url(path='/foo', query='bar=baz'),
        '/foo?bar=baz#banana?apple/orange': Url(path='/foo', query='bar=baz', fragment='banana?apple/orange'),
        # Port handling (explicit, empty, and absent)
        'http://google.com/': Url('http', host='google.com', path='/'),
        'http://google.com:80/': Url('http', host='google.com', port=80, path='/'),
        'http://google.com:/': Url('http', host='google.com', path='/'),
        'http://google.com:80': Url('http', host='google.com', port=80),
        'http://google.com:': Url('http', host='google.com'),
        # Auth component
        'http://*****:*****@localhost/': Url('http', auth='foo:bar', host='localhost', path='/'),
        'http://foo@localhost/': Url('http', auth='foo', host='localhost', path='/'),
        'http://*****:*****@baz@localhost/': Url('http', auth='foo:bar@baz', host='localhost', path='/'),
        'http://@': Url('http', host=None, auth=''),
    }
    for raw, want in cases.items():
        self.assertEqual(parse_url(raw), want)
def test_parse_url_bytes_to_str_python_2(self):
    """On Python 2, byte-string input must come back with native-str components."""
    parsed = parse_url(b"https://www.google.com/")
    assert parsed == Url('https', host='www.google.com', path='/')
    # Every textual field should be the native str type, never bytes-leaked.
    for component in (parsed.scheme, parsed.host, parsed.path):
        assert isinstance(component, str)
def __init__(self, proxy_url, username=None, password=None, num_pools=10, headers=None, **connection_pool_kw):
    """Set up a pool manager whose connections tunnel through a SOCKS proxy.

    The SOCKS protocol version is derived from *proxy_url*'s scheme
    ('socks4' or 'socks5'); anything else is rejected.
    """
    parsed = parse_url(proxy_url)

    # Translate the URL scheme into a PySocks proxy-type constant.
    socks_version = {
        'socks5': socks.PROXY_TYPE_SOCKS5,
        'socks4': socks.PROXY_TYPE_SOCKS4,
    }.get(parsed.scheme)
    if socks_version is None:
        raise ValueError(
            "Unable to determine SOCKS version from %s" % proxy_url
        )

    self.proxy_url = proxy_url

    # Options consumed by the SOCKS-aware connection classes.
    connection_pool_kw['_socks_options'] = {
        'socks_version': socks_version,
        'proxy_host': parsed.host,
        'proxy_port': parsed.port,
        'username': username,
        'password': password,
    }

    super(SOCKSProxyManager, self).__init__(
        num_pools, headers, **connection_pool_kw
    )

    # Route every scheme through the SOCKS-capable pool classes.
    self.pool_classes_by_scheme = SOCKSProxyManager.pool_classes_by_scheme
def test_parse_url(self):
    """Exhaustive table-driven check of parse_url's component splitting."""
    expectations = {
        # Scheme / host / path combinations
        "http://google.com/mail": Url("http", host="google.com", path="/mail"),
        "http://google.com/mail/": Url("http", host="google.com", path="/mail/"),
        "google.com/mail": Url(host="google.com", path="/mail"),
        "http://google.com/": Url("http", host="google.com", path="/"),
        "http://google.com": Url("http", host="google.com"),
        "http://google.com?foo": Url("http", host="google.com", path="", query="foo"),
        # Path / query / fragment
        "": Url(),
        "/": Url(path="/"),
        "?": Url(path="", query=""),
        "#": Url(path="", fragment=""),
        "#?/!google.com/?foo#bar": Url(path="", fragment="?/!google.com/?foo#bar"),
        "/foo": Url(path="/foo"),
        "/foo?bar=baz": Url(path="/foo", query="bar=baz"),
        "/foo?bar=baz#banana?apple/orange": Url(path="/foo", query="bar=baz", fragment="banana?apple/orange"),
        # Port handling (explicit, empty, and absent)
        "http://google.com/": Url("http", host="google.com", path="/"),
        "http://google.com:80/": Url("http", host="google.com", port=80, path="/"),
        "http://google.com:/": Url("http", host="google.com", path="/"),
        "http://google.com:80": Url("http", host="google.com", port=80),
        "http://google.com:": Url("http", host="google.com"),
        # Auth component
        "http://*****:*****@localhost/": Url("http", auth="foo:bar", host="localhost", path="/"),
        "http://foo@localhost/": Url("http", auth="foo", host="localhost", path="/"),
        "http://*****:*****@baz@localhost/": Url("http", auth="foo:bar@baz", host="localhost", path="/"),
        "http://@": Url("http", host=None, auth=""),
    }
    for raw, want in expectations.items():
        self.assertEqual(parse_url(raw), want)
def test_parse_url_unicode_python_2(self):
    """On Python 2, unicode input must yield unicode components."""
    parsed = parse_url(u"https://www.google.com/")
    assert parsed == Url(u'https', host=u'www.google.com', path=u'/')
    # All textual fields should be text (unicode on py2), not bytes.
    for part in (parsed.scheme, parsed.host, parsed.path):
        assert isinstance(part, six.text_type)
def test_netloc(self):
    """netloc is the host joined with an explicit port, when one was given."""
    cases = (
        ("http://google.com/mail", "google.com"),
        ("http://google.com:80/mail", "google.com:80"),
        ("google.com/foobar", "google.com"),
        ("google.com:12345", "google.com:12345"),
    )
    for raw, want in cases:
        self.assertEqual(parse_url(raw).netloc, want)
def test_netloc(self):
    """The netloc property reflects host and, when present, the port."""
    fixtures = [
        ('http://google.com/mail', 'google.com'),
        ('http://google.com:80/mail', 'google.com:80'),
        ('google.com/foobar', 'google.com'),
        ('google.com:12345', 'google.com:12345'),
    ]
    for candidate, expected in fixtures:
        self.assertEqual(parse_url(candidate).netloc, expected)
def test_parse_url_normalization(self):
    """Only the scheme and host are case-normalized; path/query/fragment keep case."""
    for raw, want in [
        ('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
        ('HTTP://*****:*****@Example.com:8080/', 'http://*****:*****@example.com:8080/'),
        ('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
        ('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
    ]:
        self.assertEqual(parse_url(raw).url, want)
def test_request_uri(self):
    """request_uri is the path (defaulting to '/') plus any query string."""
    fixtures = [
        ('http://google.com/mail', '/mail'),
        ('http://google.com/mail/', '/mail/'),
        ('http://google.com/', '/'),
        ('http://google.com', '/'),
        ('', '/'),
        ('/', '/'),
        ('?', '/?'),
        ('#', '/'),
        ('/foo?bar=baz', '/foo?bar=baz'),
    ]
    for candidate, expected in fixtures:
        self.assertEqual(parse_url(candidate).request_uri, expected)
def test_request_uri(self):
    """request_uri combines path and query; empty paths collapse to '/'."""
    cases = {
        "http://google.com/mail": "/mail",
        "http://google.com/mail/": "/mail/",
        "http://google.com/": "/",
        "http://google.com": "/",
        "": "/",
        "/": "/",
        "?": "/?",
        "#": "/",
        "/foo?bar=baz": "/foo?bar=baz",
    }
    for raw, want in cases.items():
        self.assertEqual(parse_url(raw).request_uri, want)
def validate_run_info(info: dict) -> bool:
    """Validate that a run description names an accessible binary and a usable target URL.

    Args:
        info: mapping with at least ``'binary'`` (executable name to resolve on
            PATH) and ``'target'`` (URL string) keys.

    Returns:
        True when the binary resolves to an existing file AND the parsed target
        URL has a non-None hostname, path, and reassembled url; False otherwise.
    """
    # ## Validate Binary Executable ## #
    # Bug fix: shutil.which returns None when the binary is not on PATH, and
    # the original passed that straight into Path(), raising TypeError.
    resolved = shutil.which(info["binary"])
    if resolved is None or not Path(resolved).exists():
        # Bug fix: logger.error was called with extra positional args but no
        # %s placeholders in the message, which breaks log formatting.
        logger.error("binary %s is not accessible from this process", info["binary"])
        # Bug fix: the original's binary-check result was silently overwritten
        # by the URL loop below; a missing binary must fail validation.
        return False

    # ## Check the URL For Validity, By Checking Its Parts ## #
    # Bug fix: the original referenced `runInfo["target"]` (NameError) — the
    # parameter is named `info`.
    link = url.parse_url(info["target"])
    return all(part is not None for part in (link.hostname, link.path, link.url))
def test_parse_url_negative_port(self):
    """A negative port number must be rejected as unparseable."""
    bad = "https://www.google.com:-80/"
    with pytest.raises(LocationParseError):
        parse_url(bad)
) -> str: """Try to request the login page to see if the URL is valid.""" import requests from requests.exceptions import SSLError from urllib3.util.url import parse_url kwargs: Dict[str, Any] = { "timeout": timeout, "verify": Options.ca_bundle or not Options.ssl_no_verify, "cert": client_certificate(), "headers": { "User-Agent": user_agent() }, } try: parse_url(url) log.debug(f"Testing URL {url!r}") full_url = f"{url}/{login_page}" if proxy: kwargs["proxies"] = proxy.settings(url=full_url) with requests.get(full_url, **kwargs) as resp: resp.raise_for_status() if resp.status_code == 200: # Happens when JSF is installed log.debug(f"Valid URL: {url}") return "" except SSLError as exc: if "CERTIFICATE_VERIFY_FAILED" in str(exc): raise InvalidSSLCertificate() elif "CERTIFICATE_REQUIRED" in str(exc): raise MissingClientSSLCertificate() elif "password is required" in str(exc):
def test_parse_url_invalid_IPv6(self):
    """An unterminated IPv6 bracket literal must raise on parse."""
    truncated_literal = '[::1'
    with pytest.raises(ValueError):
        parse_url(truncated_literal)
def test_parse_url(self, url, expected_url):
    """Each parametrized URL parses to exactly its expected Url tuple."""
    assert parse_url(url) == expected_url
def test_parse_url_invalid_IPv6(self):
    """A bracketed IPv6 host with no closing bracket is unparseable."""
    unterminated = "[::1"
    with pytest.raises(LocationParseError):
        parse_url(unterminated)
def test_request_uri(self, url: str, expected_request_uri: str) -> None:
    """request_uri of the parsed URL matches the parametrized expectation."""
    assert parse_url(url).request_uri == expected_request_uri
def test_invalid_url(self, url):
    """Every parametrized malformed URL must raise LocationParseError."""
    with pytest.raises(LocationParseError):
        parse_url(url)
def test_url_vulnerabilities(self, url, expected_url):
    """Known URL-confusion payloads either raise or parse to a safe Url."""
    if expected_url is False:
        # False is the fixture's sentinel for "must be rejected outright".
        with pytest.raises(LocationParseError):
            parse_url(url)
        return
    assert parse_url(url) == expected_url
def test_parse_url_without_idna(self):
    """A host that cannot be IDNA-encoded surfaces as LocationParseError."""
    # \uD7FF sits at the edge of the surrogate range and defeats IDNA encoding.
    bad_host_url = "http://\uD7FF.com"
    with pytest.raises(LocationParseError, match=f"Failed to parse: {bad_host_url}"):
        parse_url(bad_host_url)
def prepare_options(self):
    """Validate the crawler configuration, provision a chromedriver if needed,
    configure Chrome's mobile-emulation options, and apply defaults.

    Returns self so calls can be chained (builder style).
    Raises Exception on a blank/invalid URL, unknown platform, or a missing
    webdriver executable.
    """
    # --- URL validation: must target artsandculture.google.com ---
    if is_blank(self._url):
        raise Exception("GoogleArtsCrawlerOption , url is blank!")
    uprs = parse_url(url=self._url)
    if not uprs.host == 'artsandculture.google.com':
        raise Exception("GoogleArtsCrawlerOption, url netloc is not `artsandculture.google.com`")
    # Rebuild the URL as https with only host+path (drops query/fragment).
    self._url = "https://{0}{1}".format(uprs.host, uprs.path)
    # --- Webdriver provisioning (download on demand) ---
    if self._webdriver_execute_path is None and self._need_download_webdrive:
        default_webdrive_path = "webdriver"
        # Reuse a previously extracted driver if one is already present.
        if os.path.isdir(default_webdrive_path):
            default_webdrive_files = os.listdir(default_webdrive_path)
            if len(default_webdrive_files) > 0:
                default_webdrive_execute_file = os.path.join(default_webdrive_path, default_webdrive_files[0])
                if os.path.isfile(default_webdrive_execute_file):
                    print("==> webdriver has exist at {0}".format(default_webdrive_execute_file))
                    self._webdriver_execute_path = default_webdrive_execute_file
        if self._webdriver_execute_path is None:
            # Pick the chromedriver 78.0.3904.70 build matching this platform.
            if WINDOWS:
                os_name = "Windows"
                webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_win32.zip"
            elif DARWIN:
                os_name = "Mac OS"
                webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_mac64.zip"
            elif LINUX:
                os_name = "Linux"
                webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_linux64.zip"
            else:
                raise Exception("GoogleArtsCrawlerOptions, unknown platform !")
            print("==> current operation system : {0}".format(os_name))
            print("==> prepare download webdriver : {0}".format(webdriver_download_url))
            default_download_tmp = "tmp"
            webdriver_zip_filename = webdriver_download_url.split("/")[-1]
            webdriver_local_zip_filepath = os.path.join(default_download_tmp, webdriver_zip_filename)
            # not exist
            if not os.path.isfile(webdriver_local_zip_filepath):
                # NOTE(review): the download is hard-wired through a local
                # SOCKS5 proxy on port 1086 — fails without such a proxy
                # running; consider making this configurable.
                http = SOCKSProxyManager('socks5://localhost:1086/')
                # http = PoolManager()
                response = http.request('GET', webdriver_download_url, preload_content=False)
                if not os.path.isdir(default_download_tmp):
                    os.mkdir(default_download_tmp)
                # Stream the zip to disk in 1 KiB chunks.
                with open(webdriver_local_zip_filepath, mode="wb") as fd:
                    while True:
                        data = response.read(1024)
                        if not data:
                            break
                        fd.write(data)
                response.release_conn()
                print("==> webdriver zip file download finished , location at : {0}".format(
                    os.path.abspath(webdriver_local_zip_filepath)))
            else:
                print("==> webdriver zip file has existed at {0}".format(webdriver_local_zip_filepath))
            with ZipFile(webdriver_local_zip_filepath, 'r') as zipfile:
                zipfile.extractall(path=default_webdrive_path)
            if self._need_clear_cache:
                shutil.rmtree(default_download_tmp)
            # Assume the archive contains exactly one file: the driver binary.
            self._webdriver_execute_path = os.path.join(default_webdrive_path, os.listdir(default_webdrive_path)[0])
    if is_blank(self._webdriver_execute_path):
        raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is blank!")
    if not os.path.isfile(self._webdriver_execute_path):
        raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is not exist, this is file!")
    if LINUX or DARWIN:
        # Ensure the downloaded driver is executable on POSIX systems.
        os.chmod(self._webdriver_execute_path, 0o777)
    # self._chrome_options.binary_location = os.path.abspath(self._webdriver_execute_path)
    # --- Chrome mobile-emulation setup ---
    # NOTE(review): self._size is consumed here, but its None/minimum default
    # is only applied further below — if _size is None the deviceMetrics get
    # None width/height. Confirm whether the defaulting should happen first.
    mobile_emulation = {
        "deviceMetrics": {"width": self._size, "height": self._size, "pixelRatio": 1.0},
        "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 "
                     "(KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
    self._chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
    self._chrome_options.add_argument('--no-sandbox')
    self._chrome_options.add_argument('--disable-dev-shm-usage')
    self._chrome_options.add_argument('--disable-gpu')
    # NOTE(review): --disable-dev-shm-usage is added twice (harmless but redundant).
    self._chrome_options.add_argument("--disable-dev-shm-usage")
    self._chrome_options.add_argument("start-maximized")
    self._chrome_options.add_argument("disable-infobars")
    self._chrome_options.add_argument("--disable-extensions")
    if not self._is_debug:
        self._chrome_options.add_argument("--headless")
    # --- Defaulting of unset options ---
    self._output_path = DEFAULT_GCO_OUTPUT_PATH if self._output_path is None else self._output_path
    self._size = DEFAULT_GCO_SIZE if self._size is None or self._size < 1 else self._size
    self._init_delay_time = DEFAULT_GCO_INIT_DELAY if self._init_delay_time is None or self._init_delay_time < 1 else self._init_delay_time
    if not os.path.isdir(self._output_path):
        os.makedirs(self._output_path)
    if not os.path.isdir(self._partial_tmp_path):
        os.makedirs(self._partial_tmp_path)
    if self._is_debug:
        print("GoogleArtsCrawlerOptions:")
        print("==> url:{0}".format(self._url))
        print("==> webdriver_execute_path:{0}".format(os.path.abspath(self._webdriver_execute_path)))
        print("==> output :{0}".format(os.path.abspath(self._output_path)))
    return self
"""Fetch a tarball (from argv[1], the clipboard, or a local path) and extract it.

Prints the extracted directory's path with no trailing newline, so a caller can
consume the output, e.g. ``cd $(python fetch_tar.py URL)``.
"""
import sys
import tarfile

from path import Path, getcwdu
from pyperclip import paste
from sh import wget
from urllib3.util.url import parse_url

TMP_NAME = 'temporary_tar'

# Acquire the target from the command line, falling back to the clipboard.
target = ''
try:
    target = sys.argv[1] if len(sys.argv) == 2 else paste()
except Exception:
    pass

# Bug fix: the original printed a warning here but then carried on with an
# empty target, ending up shelling out to `wget ''`; abort instead.
if not target:
    print('no arguments or clipboard contents', file=sys.stderr)
    sys.exit(1)

if not Path(target).exists():
    # Remote URL: download it; the output dir name comes from the URL path.
    wget(target, O=TMP_NAME)
    stem = parse_url(target).path.split('/')[-1]
else:
    # Local file: consume it in place.
    Path(target).move(TMP_NAME)
    stem = target.split('/')[-1]
target_dir = getcwdu() + '/' + stem.split('.')[0]

# NOTE(review): extractall on an untrusted archive can write outside the CWD
# (path traversal) — add member validation/a filter if inputs are untrusted.
with tarfile.open(TMP_NAME) as tf:
    tf.extractall()
Path(TMP_NAME).remove()

# Emit the result path for the calling shell (no newline).
print(Path(target_dir), end='')
def test_parse_url_bytes_type_error(self) -> None:
    """Byte strings are not accepted by parse_url."""
    raw = b"https://www.google.com/"
    with pytest.raises(TypeError):
        parse_url(raw)  # type: ignore[arg-type]
def test_netloc(self, url: str, expected_netloc: Optional[str]) -> None:
    """netloc of the parsed URL matches the parametrized expectation."""
    parsed = parse_url(url)
    assert parsed.netloc == expected_netloc
def test_authority(self, url: str, expected_authority: Optional[str]) -> None:
    """authority (userinfo@host:port) matches the parametrized expectation."""
    parsed = parse_url(url)
    assert parsed.authority == expected_authority
def test_request_uri(self, url, expected_request_uri):
    """request_uri of the parsed URL matches the fixture value."""
    assert parse_url(url).request_uri == expected_request_uri
def test_parse_url(self, url: str, expected_url: Url) -> None:
    """The parsed Url matches, and .hostname stays an alias of .host."""
    parsed = parse_url(url)
    assert parsed == expected_url
    # hostname must mirror host on both the actual and expected tuples.
    assert parsed.hostname == parsed.host == expected_url.host
def test_netloc(self, url, expected_netloc):
    """netloc of the parsed URL matches the fixture value."""
    parsed = parse_url(url)
    assert parsed.netloc == expected_netloc
def test_parse_and_normalize_url_paths(self, url: str, expected_url: Url) -> None:
    """Path normalization holds for both the tuple and its string form."""
    parsed = parse_url(url)
    assert parsed == expected_url
    assert parsed.url == expected_url.url
def test_parse_url_bytes_type_error(self):
    """Bytes input must raise TypeError rather than being decoded."""
    raw = b"https://www.google.com/"
    with pytest.raises(TypeError):
        parse_url(raw)
def get_client(self, device_id, secret):
    """Build a LocalClient for *device_id*/*secret* pointed at the live test server."""
    client = LocalClient(device_id, secret)
    server = parse_url(self.liveserver.url)
    client.host, client.port = server.host, server.port
    return client
def test_parse_url(self):
    """parse_url must reproduce the expected Url for every fixture in both maps."""
    # NOTE(review): `self.parse_url_host_map` is iterated directly while the
    # second map gets `.items()`. If the attribute is a dict this yields bare
    # keys and the tuple-unpack below fails — compare the sibling test that
    # calls `.items()` on both. If it is a list of (url, Url) pairs this is
    # fine. Confirm the attribute's type where it is defined.
    for url, expected_Url in chain(self.parse_url_host_map, self.non_round_tripping_parse_url_host_map.items()):
        returned_Url = parse_url(url)
        self.assertEqual(returned_Url, expected_Url)
def mongo_start(url, text):
    """Fetch summary data for *url* backed by the local MongoDB instance.

    Connects to mongod on 127.0.0.1:27017, selects the 'shrink_db' database,
    and delegates to fetch_data with the chosen collection.
    """
    # Second label of the parsed hostname (e.g. 'example' for 'www.example.com').
    # NOTE(review): `host` is never used below. `db.host` accesses the
    # collection literally named "host", not the one named by this variable —
    # confirm whether `db[host]` was intended.
    host = parse_url(url).host.split('.')[1]
    client = MongoClient('127.0.0.1', 27017)
    db = client['shrink_db']
    summaries = db.host
    return fetch_data(url, summaries, text)
def test_parse_url_normalization(self, url, expected_normalized_url):
    """Only the scheme/host are normalized; path/query/fragment keep case."""
    assert parse_url(url).url == expected_normalized_url
def test_parse_url_invalid_IPv6(self):
    """A truncated IPv6 bracket literal must be rejected."""
    truncated = '[::1'
    with pytest.raises(LocationParseError):
        parse_url(truncated)
def test_parse_and_normalize_url_paths(self, url, expected_url):
    """Both the parsed tuple and its re-serialized string are normalized."""
    parsed = parse_url(url)
    assert parsed == expected_url
    assert parsed.url == expected_url.url
def __init__(self, username, password, url):
    """Store credentials and resolve the REST API v2 base URL for *url*'s host."""
    self.username = username
    self.password = password
    # Keep only the host from the supplied URL, then rebase onto the API root.
    host = parse_url(url).host
    self.url = parse_url('https://' + host + '/rest/api/2/')
def get_url(url):
    """Thin wrapper: return the Url tuple produced by parse_url."""
    return parse_url(url)
def verify_host(url, allowed):
    """Return True when *url*'s host is a member of the *allowed* collection."""
    host = parse_url(url).host
    return host in allowed
def __make_url(self, endpoint):
    """Join *endpoint* onto the stored base URL and return the normalized string."""
    base = "{0}://{1}{2}".format(self.url.scheme, self.url.host, self.url.path)
    return parse_url(base + endpoint).url
def test_parse_url(self):
    """Both fixture maps (round-tripping and not) parse to their expected Urls."""
    fixtures = chain(self.parse_url_host_map.items(),
                     self.non_round_tripping_parse_url_host_map.items())
    for raw, want in fixtures:
        self.assertEqual(parse_url(raw), want)
def test_parse_url(self, url, expected_url):
    """The parsed Url matches, and hostname mirrors host on both sides."""
    parsed = parse_url(url)
    assert parsed == expected_url
    assert parsed.hostname == parsed.host == expected_url.host
def test_parse_url_bytes_type_error_python_3(self):
    """On Python 3, bytes input must raise TypeError."""
    raw = b"https://www.google.com/"
    with pytest.raises(TypeError):
        parse_url(raw)
def get_client(self, jwt_token=None):
    """Build a LocalClient (optionally JWT-authenticated) aimed at the live server."""
    client = LocalClient(jwt_token)
    server = parse_url(self.liveserver.url)
    client.host, client.port = server.host, server.port
    return client