Ejemplo n.º 1
0
    def test_parse_url(self):
        """Every URL in the map must parse to exactly the expected Url tuple."""
        url_host_map = {
            # Scheme/host/path combinations
            "http://google.com/mail": Url("http", host="google.com", path="/mail"),
            "http://google.com/mail/": Url("http", host="google.com", path="/mail/"),
            "google.com/mail": Url(host="google.com", path="/mail"),
            "http://google.com/": Url("http", host="google.com", path="/"),
            "http://google.com": Url("http", host="google.com"),
            "http://google.com?foo": Url("http", host="google.com", path="", query="foo"),

            # Path/query/fragment
            "": Url(),
            "/": Url(path="/"),
            "?": Url(path="", query=""),
            "#": Url(path="", fragment=""),
            "#?/!google.com/?foo#bar": Url(path="", fragment="?/!google.com/?foo#bar"),
            "/foo": Url(path="/foo"),
            "/foo?bar=baz": Url(path="/foo", query="bar=baz"),
            "/foo?bar=baz#banana?apple/orange": Url(path="/foo", query="bar=baz", fragment="banana?apple/orange"),

            # Port
            "http://google.com/": Url("http", host="google.com", path="/"),
            "http://google.com:80/": Url("http", host="google.com", port=80, path="/"),
            "http://google.com:/": Url("http", host="google.com", path="/"),
            "http://google.com:80": Url("http", host="google.com", port=80),
            "http://google.com:": Url("http", host="google.com"),

            # Auth
            "http://*****:*****@localhost/": Url("http", auth="foo:bar", host="localhost", path="/"),
            "http://foo@localhost/": Url("http", auth="foo", host="localhost", path="/"),
            "http://*****:*****@baz@localhost/": Url("http", auth="foo:bar@baz", host="localhost", path="/"),
            "http://@": Url("http", host=None, auth=""),
        }
        for raw, expected in url_host_map.items():
            self.assertEqual(parse_url(raw), expected)
Ejemplo n.º 2
0
    def test_parse_url_bytes_to_str_python_2(self):
        """On Python 2, byte-string input is decoded to native str components."""
        parsed = parse_url(b"https://www.google.com/")
        assert parsed == Url('https', host='www.google.com', path='/')

        # Every component of the result must be a native str, not bytes.
        for component in (parsed.scheme, parsed.host, parsed.path):
            assert isinstance(component, str)
Ejemplo n.º 3
0
    def __init__(self, proxy_url, username=None, password=None,
                 num_pools=10, headers=None, **connection_pool_kw):
        """Pool manager that tunnels connections through a SOCKS proxy.

        :param proxy_url: Proxy URL; its scheme must be ``socks4`` or ``socks5``.
        :param username: Optional username for proxy authentication.
        :param password: Optional password for proxy authentication.
        :param num_pools: Number of connection pools (forwarded to the parent).
        :param headers: Default request headers (forwarded to the parent).
        :param connection_pool_kw: Extra keyword args for each connection pool.
        :raises ValueError: If the proxy URL scheme is not socks4/socks5.
        """
        parsed = parse_url(proxy_url)

        # Map the URL scheme onto the PySocks protocol constant.
        if parsed.scheme == 'socks5':
            socks_version = socks.PROXY_TYPE_SOCKS5
        elif parsed.scheme == 'socks4':
            socks_version = socks.PROXY_TYPE_SOCKS4
        else:
            raise ValueError(
                "Unable to determine SOCKS version from %s" % proxy_url
            )

        self.proxy_url = proxy_url

        # Bundle the proxy parameters so each pool can open its SOCKS tunnel.
        socks_options = {
            'socks_version': socks_version,
            'proxy_host': parsed.host,
            'proxy_port': parsed.port,
            'username': username,
            'password': password,
        }
        connection_pool_kw['_socks_options'] = socks_options

        super(SOCKSProxyManager, self).__init__(
            num_pools, headers, **connection_pool_kw
        )

        # NOTE(review): assigned after super().__init__, presumably so the
        # parent's default scheme map does not overwrite it — confirm.
        self.pool_classes_by_scheme = SOCKSProxyManager.pool_classes_by_scheme
Ejemplo n.º 4
0
 def test_parse_url(self):
     """Every URL in the map must parse to exactly the expected Url tuple."""
     url_host_map = {
         # Scheme/host/path combinations
         'http://google.com/mail': Url('http', host='google.com', path='/mail'),
         'http://google.com/mail/': Url('http', host='google.com', path='/mail/'),
         'google.com/mail': Url(host='google.com', path='/mail'),
         'http://google.com/': Url('http', host='google.com', path='/'),
         'http://google.com': Url('http', host='google.com'),
         'http://google.com?foo': Url('http', host='google.com', path='', query='foo'),
         # Path/query/fragment
         '': Url(),
         '/': Url(path='/'),
         '?': Url(path='', query=''),
         '#': Url(path='', fragment=''),
         '#?/!google.com/?foo#bar': Url(path='', fragment='?/!google.com/?foo#bar'),
         '/foo': Url(path='/foo'),
         '/foo?bar=baz': Url(path='/foo', query='bar=baz'),
         '/foo?bar=baz#banana?apple/orange': Url(path='/foo', query='bar=baz', fragment='banana?apple/orange'),
         # Port
         'http://google.com/': Url('http', host='google.com', path='/'),
         'http://google.com:80/': Url('http', host='google.com', port=80, path='/'),
         'http://google.com:/': Url('http', host='google.com', path='/'),
         'http://google.com:80': Url('http', host='google.com', port=80),
         'http://google.com:': Url('http', host='google.com'),
         # Auth
         'http://*****:*****@localhost/': Url('http', auth='foo:bar', host='localhost', path='/'),
         'http://foo@localhost/': Url('http', auth='foo', host='localhost', path='/'),
         'http://*****:*****@baz@localhost/': Url('http', auth='foo:bar@baz', host='localhost', path='/'),
         'http://@': Url('http', host=None, auth=''),
     }
     for raw, expected in url_host_map.items():
         self.assertEqual(parse_url(raw), expected)
Ejemplo n.º 5
0
    def test_parse_url_unicode_python_2(self):
        """On Python 2, unicode input yields unicode (text_type) components."""
        parsed = parse_url(u"https://www.google.com/")
        assert parsed == Url(u'https', host=u'www.google.com', path=u'/')

        # All components must keep the text type, never degrade to bytes.
        for component in (parsed.scheme, parsed.host, parsed.path):
            assert isinstance(component, six.text_type)
Ejemplo n.º 6
0
    def test_netloc(self):
        """The netloc attribute exposes host plus optional :port."""
        cases = [
            ("http://google.com/mail", "google.com"),
            ("http://google.com:80/mail", "google.com:80"),
            ("google.com/foobar", "google.com"),
            ("google.com:12345", "google.com:12345"),
        ]
        for raw, expected_netloc in cases:
            self.assertEqual(parse_url(raw).netloc, expected_netloc)
Ejemplo n.º 7
0
    def test_netloc(self):
        """The netloc attribute exposes host plus optional :port."""
        cases = (
            ('http://google.com/mail', 'google.com'),
            ('http://google.com:80/mail', 'google.com:80'),
            ('google.com/foobar', 'google.com'),
            ('google.com:12345', 'google.com:12345'),
        )
        for raw, expected_netloc in cases:
            self.assertEqual(parse_url(raw).netloc, expected_netloc)
Ejemplo n.º 8
0
 def test_parse_url_normalization(self):
     """Assert parse_url normalizes the scheme/host, and only the scheme/host."""
     cases = (
         ('HTTP://GOOGLE.COM/MAIL/', 'http://google.com/MAIL/'),
         ('HTTP://*****:*****@Example.com:8080/', 'http://*****:*****@example.com:8080/'),
         ('HTTPS://Example.Com/?Key=Value', 'https://example.com/?Key=Value'),
         ('Https://Example.Com/#Fragment', 'https://example.com/#Fragment'),
     )
     for raw, expected in cases:
         # Path, query and fragment must keep their original casing.
         self.assertEqual(parse_url(raw).url, expected)
Ejemplo n.º 9
0
 def test_request_uri(self):
     """request_uri joins path and query, defaulting the path to '/'."""
     cases = [
         ('http://google.com/mail', '/mail'),
         ('http://google.com/mail/', '/mail/'),
         ('http://google.com/', '/'),
         ('http://google.com', '/'),
         ('', '/'),
         ('/', '/'),
         ('?', '/?'),
         ('#', '/'),
         ('/foo?bar=baz', '/foo?bar=baz'),
     ]
     for raw, expected_request_uri in cases:
         self.assertEqual(parse_url(raw).request_uri, expected_request_uri)
Ejemplo n.º 10
0
 def test_request_uri(self):
     """request_uri joins path and query, defaulting the path to '/'."""
     cases = (
         ("http://google.com/mail", "/mail"),
         ("http://google.com/mail/", "/mail/"),
         ("http://google.com/", "/"),
         ("http://google.com", "/"),
         ("", "/"),
         ("/", "/"),
         ("?", "/?"),
         ("#", "/"),
         ("/foo?bar=baz", "/foo?bar=baz"),
     )
     for raw, expected_request_uri in cases:
         self.assertEqual(parse_url(raw).request_uri, expected_request_uri)
Ejemplo n.º 11
0
def validate_run_info(info: dict) -> bool:
	"""Validate that a run description is usable.

	Checks that ``info["binary"]`` resolves to an existing executable on the
	PATH, and that ``info["target"]`` parses into a URL with a hostname,
	path and reassembled URL.

	:param info: run description with at least ``binary`` and ``target`` keys.
	:return: ``True`` only when both the binary and the URL checks pass.
	"""
	# ## Validate Binary Executable ## #
	# shutil.which returns None when the binary is missing; the original
	# passed that straight to Path() and crashed with a TypeError.
	which_result = shutil.which(info["binary"])
	if which_result is None or not Path(which_result).exists():
		# logging uses %-style lazy formatting, not print-style varargs.
		logger.error("binary %s is not accessible from this process", info["binary"])
		return False
	# ## Check the URL For Validity ## #
	# ##   By Checking Its Parts    ## #
	# Was `runInfo["target"]` (NameError) — the parameter is named `info`.
	link = url.parse_url(info["target"])
	# All three parts must be present; a missing binary can no longer be
	# masked by a valid URL (the original loop overwrote `passed`).
	return all(part is not None for part in (link.hostname, link.path, link.url))
Ejemplo n.º 12
0
 def test_parse_url_negative_port(self):
     """A negative port number must raise LocationParseError."""
     bad_url = "https://www.google.com:-80/"
     with pytest.raises(LocationParseError):
         parse_url(bad_url)
Ejemplo n.º 13
0
) -> str:
    """Try to request the login page to see if the URL is valid."""
    import requests
    from requests.exceptions import SSLError
    from urllib3.util.url import parse_url

    kwargs: Dict[str, Any] = {
        "timeout": timeout,
        "verify": Options.ca_bundle or not Options.ssl_no_verify,
        "cert": client_certificate(),
        "headers": {
            "User-Agent": user_agent()
        },
    }
    try:
        parse_url(url)
        log.debug(f"Testing URL {url!r}")
        full_url = f"{url}/{login_page}"
        if proxy:
            kwargs["proxies"] = proxy.settings(url=full_url)
        with requests.get(full_url, **kwargs) as resp:
            resp.raise_for_status()
            if resp.status_code == 200:  # Happens when JSF is installed
                log.debug(f"Valid URL: {url}")
                return ""
    except SSLError as exc:
        if "CERTIFICATE_VERIFY_FAILED" in str(exc):
            raise InvalidSSLCertificate()
        elif "CERTIFICATE_REQUIRED" in str(exc):
            raise MissingClientSSLCertificate()
        elif "password is required" in str(exc):
Ejemplo n.º 14
0
 def test_parse_url_invalid_IPv6(self):
     """An unterminated IPv6 literal must be rejected."""
     malformed = '[::1'
     with pytest.raises(ValueError):
         parse_url(malformed)
Ejemplo n.º 15
0
 def test_parse_url(self, url, expected_url):
     """Each parametrized URL parses to its expected Url tuple."""
     assert parse_url(url) == expected_url
Ejemplo n.º 16
0
 def test_parse_url_invalid_IPv6(self):
     """An unterminated IPv6 literal raises LocationParseError."""
     pytest.raises(LocationParseError, parse_url, "[::1")
Ejemplo n.º 17
0
 def test_request_uri(self, url: str, expected_request_uri: str) -> None:
     """request_uri of each parametrized URL matches the expectation."""
     actual = parse_url(url).request_uri
     assert actual == expected_request_uri
Ejemplo n.º 18
0
 def test_invalid_url(self, url):
     """Each parametrized invalid URL raises LocationParseError."""
     pytest.raises(LocationParseError, parse_url, url)
Ejemplo n.º 19
0
 def test_url_vulnerabilities(self, url, expected_url):
     """Known-hostile URLs either fail to parse or normalize safely."""
     if expected_url is not False:
         # A safe normalization is expected for this case.
         assert parse_url(url) == expected_url
     else:
         # False marks URLs that must be rejected outright.
         with pytest.raises(LocationParseError):
             parse_url(url)
Ejemplo n.º 20
0
 def test_parse_url_without_idna(self):
     """A host that cannot be IDNA-encoded fails with a descriptive message."""
     url = "http://\uD7FF.com"
     expected_message = f"Failed to parse: {url}"
     with pytest.raises(LocationParseError, match=expected_message):
         parse_url(url)
Ejemplo n.º 21
0
    def prepare_options(self):
        """Validate options and prepare everything needed to run the crawler.

        Steps: validate the target URL, locate or download a ChromeDriver
        executable, configure headless mobile-emulation Chrome options, and
        create the output directories.  Returns ``self`` for chaining.
        Raises Exception on a blank/foreign URL, an unknown platform, or a
        missing webdriver executable.
        """
        # --- URL validation: only artsandculture.google.com is accepted. ---
        if is_blank(self._url):
            raise Exception("GoogleArtsCrawlerOption , url is blank!")
        uprs = parse_url(url=self._url)
        if not uprs.host == 'artsandculture.google.com':
            raise Exception("GoogleArtsCrawlerOption, url netloc is not `artsandculture.google.com`")
        # Rebuild the URL from host + path only (drops query/fragment, forces https).
        self._url = "https://{0}{1}".format(uprs.host, uprs.path)

        # download webdriver
        if self._webdriver_execute_path is None and self._need_download_webdrive:
            # Reuse a previously downloaded driver from ./webdriver when present.
            default_webdrive_path = "webdriver"
            if os.path.isdir(default_webdrive_path):
                default_webdrive_files = os.listdir(default_webdrive_path)
                if len(default_webdrive_files) > 0:
                    default_webdrive_execute_file = os.path.join(default_webdrive_path, default_webdrive_files[0])
                    if os.path.isfile(default_webdrive_execute_file):
                        print("==> webdriver has exist at {0}".format(default_webdrive_execute_file))
                        self._webdriver_execute_path = default_webdrive_execute_file

            if self._webdriver_execute_path is None:
                # Pick the ChromeDriver 78.0.3904.70 build matching the current OS.
                if WINDOWS:
                    os_name = "Windows"
                    webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_win32.zip"
                elif DARWIN:
                    os_name = "Mac OS"
                    webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_mac64.zip"
                elif LINUX:
                    os_name = "Linux"
                    webdriver_download_url = "http://chromedriver.storage.googleapis.com/78.0.3904.70/chromedriver_linux64.zip"
                else:
                    raise Exception("GoogleArtsCrawlerOptions, unknown platform !")
                print("==> current operation system : {0}".format(os_name))
                print("==> prepare download webdriver : {0}".format(webdriver_download_url))
                default_download_tmp = "tmp"
                webdriver_zip_filename = webdriver_download_url.split("/")[-1]
                webdriver_local_zip_filepath = os.path.join(default_download_tmp, webdriver_zip_filename)

                # not exist
                if not os.path.isfile(webdriver_local_zip_filepath):
                    # NOTE(review): download goes through a hard-coded local SOCKS5
                    # proxy (localhost:1086) — confirm this is intended for all users.
                    http = SOCKSProxyManager('socks5://localhost:1086/')
                    # http = PoolManager()
                    response = http.request('GET', webdriver_download_url, preload_content=False)
                    if not os.path.isdir(default_download_tmp):
                        os.mkdir(default_download_tmp)
                    # Stream the zip archive to disk in 1 KiB chunks.
                    with open(webdriver_local_zip_filepath, mode="wb") as fd:
                        while True:
                            data = response.read(1024)
                            if not data:
                                break
                            fd.write(data)
                    response.release_conn()
                    print("==> webdriver zip file download finished , location at : {0}".format(
                        os.path.abspath(webdriver_local_zip_filepath)))
                else:
                    print("==> webdriver zip file has existed at {0}".format(webdriver_local_zip_filepath))
                with ZipFile(webdriver_local_zip_filepath, 'r') as zipfile:
                    zipfile.extractall(path=default_webdrive_path)

                if self._need_clear_cache:
                    shutil.rmtree(default_download_tmp)
                self._webdriver_execute_path = os.path.join(default_webdrive_path, os.listdir(default_webdrive_path)[0])

        if is_blank(self._webdriver_execute_path):
            raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is blank!")
        if not os.path.isfile(self._webdriver_execute_path):
            raise Exception("GoogleArtsCrawlerOption , webdriver_execute_path is not exist, this is file!")

        # ChromeDriver must be executable on POSIX systems.
        if LINUX or DARWIN:
            os.chmod(self._webdriver_execute_path, 0o777)

        # self._chrome_options.binary_location = os.path.abspath(self._webdriver_execute_path)

        # Emulate a mobile device with a square viewport of self._size pixels;
        # the UA string mimics a Nexus 5.
        mobile_emulation = {
            "deviceMetrics": {"width": self._size, "height": self._size, "pixelRatio": 1.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 "
                         "(KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        self._chrome_options.add_experimental_option("mobileEmulation", mobile_emulation)
        self._chrome_options.add_argument('--no-sandbox')
        self._chrome_options.add_argument('--disable-dev-shm-usage')
        self._chrome_options.add_argument('--disable-gpu')
        self._chrome_options.add_argument("--disable-dev-shm-usage")
        self._chrome_options.add_argument("start-maximized")
        self._chrome_options.add_argument("disable-infobars")
        self._chrome_options.add_argument("--disable-extensions")
        if not self._is_debug:
            self._chrome_options.add_argument("--headless")

        # Fall back to defaults for output path, image size and init delay.
        self._output_path = DEFAULT_GCO_OUTPUT_PATH if self._output_path is None else self._output_path
        self._size = DEFAULT_GCO_SIZE if self._size is None or self._size < 1 else self._size
        self._init_delay_time = DEFAULT_GCO_INIT_DELAY if self._init_delay_time is None or self._init_delay_time < 1 else self._init_delay_time

        if not os.path.isdir(self._output_path):
            os.makedirs(self._output_path)
        if not os.path.isdir(self._partial_tmp_path):
            os.makedirs(self._partial_tmp_path)
        if self._is_debug:
            print("GoogleArtsCrawlerOptions:")
            print("==> url:{0}".format(self._url))
            print("==> webdriver_execute_path:{0}".format(os.path.abspath(self._webdriver_execute_path)))
            print("==> output :{0}".format(os.path.abspath(self._output_path)))

        return self
Ejemplo n.º 22
0
from sys import argv
from urllib3.util.url import parse_url
from pyperclip import paste
from path import Path,getcwdu
from sh import wget
import tarfile

# Fetch a tarball (from argv or the clipboard), extract it into the current
# directory, and print the expected extraction directory path.
target = ''
# Take the archive location from argv when given, otherwise from the clipboard.
try:
    if len(argv) == 2:
        target = argv[1]
    else:
        target = paste()
except Exception:
    print('no arguments or clipboard contents')
# Not a local file: download it; either way the archive lands in ./temporary_tar.
if not Path(target).exists():
    wget(target, O='temporary_tar')
    maybeU = parse_url(target)
    # Archive name = last URL path segment; dir name = part before the first dot.
    maybeF = maybeU.path.split('/')[-1]
    targetF = getcwdu() + '/' + maybeF.split('.')[0]
else:
    # Local file: move it into place and derive the dir name the same way.
    Path(target).move('temporary_tar')
    targetF = target.split('/')[-1]
    targetF = getcwdu() + '/' + targetF.split('.')[0]
with tarfile.open('temporary_tar') as tf:
    tf.extractall()  # NOTE(review): extractall on an untrusted archive allows path traversal — consider the `filter` argument
Path('temporary_tar').remove()
# Print the extraction directory without a trailing newline (shell-friendly).
print(Path(targetF), end='')
Ejemplo n.º 23
0
 def test_parse_url_bytes_type_error(self) -> None:
     """Byte-string input is rejected with TypeError."""
     raw = b"https://www.google.com/"
     with pytest.raises(TypeError):
         parse_url(raw)  # type: ignore[arg-type]
Ejemplo n.º 24
0
 def test_netloc(self, url: str, expected_netloc: Optional[str]) -> None:
     """netloc of each parametrized URL matches the expectation."""
     actual = parse_url(url).netloc
     assert actual == expected_netloc
Ejemplo n.º 25
0
 def test_authority(self, url: str, expected_authority: Optional[str]) -> None:
     """authority (userinfo@host:port) matches the expectation."""
     actual = parse_url(url).authority
     assert actual == expected_authority
Ejemplo n.º 26
0
 def test_request_uri(self, url, expected_request_uri):
     """request_uri of each parametrized URL matches the expectation."""
     assert parse_url(url).request_uri == expected_request_uri
Ejemplo n.º 27
0
 def test_parse_url(self, url: str, expected_url: Url) -> None:
     """Parsed result equals the expected Url, and hostname aliases host."""
     parsed = parse_url(url)
     assert parsed == expected_url
     assert parsed.hostname == parsed.host == expected_url.host
Ejemplo n.º 28
0
 def test_netloc(self, url, expected_netloc):
     """netloc of each parametrized URL matches the expectation."""
     actual = parse_url(url).netloc
     assert actual == expected_netloc
Ejemplo n.º 29
0
 def test_parse_and_normalize_url_paths(self, url: str, expected_url: Url) -> None:
     """Parsing normalizes paths exactly as expected_url specifies."""
     parsed = parse_url(url)
     assert parsed == expected_url
     assert parsed.url == expected_url.url
Ejemplo n.º 30
0
 def test_parse_url_bytes_type_error(self):
     """Byte-string input is rejected with TypeError."""
     pytest.raises(TypeError, parse_url, b"https://www.google.com/")
Ejemplo n.º 31
0
 def get_client(self, device_id, secret):
     """Return a LocalClient pointed at the live server's host and port."""
     client = LocalClient(device_id, secret)
     server = parse_url(self.liveserver.url)
     client.host = server.host
     client.port = server.port
     return client
Ejemplo n.º 32
0
 def test_parse_url(self):
     """Round-trip every URL from both expectation maps through parse_url."""
     # Iterating the first dict directly yields only its keys, so the
     # (url, expected_Url) unpacking below would fail; .items() is required
     # (matching how non_round_tripping_parse_url_host_map is iterated).
     for url, expected_Url in chain(self.parse_url_host_map.items(),
                                    self.non_round_tripping_parse_url_host_map.items()):
         returned_Url = parse_url(url)
         self.assertEqual(returned_Url, expected_Url)
Ejemplo n.º 33
0
def mongo_start(url, text):
    """Fetch summaries for *url* from the Mongo collection named after its host.

    The collection name is the second dotted label of the URL's host
    (e.g. ``google`` for ``www.google.com``).

    :param url: URL whose host selects the collection.
    :param text: payload forwarded to fetch_data.
    :return: whatever fetch_data returns.
    """
    host = parse_url(url).host.split('.')[1]
    client = MongoClient('127.0.0.1', 27017)
    db = client['shrink_db']
    # `db.host` addressed the literal collection named "host", leaving the
    # computed `host` variable unused; index by the computed name instead.
    summaries = db[host]
    return fetch_data(url, summaries, text)
Ejemplo n.º 34
0
 def test_parse_url_normalization(self, url, expected_normalized_url):
     """parse_url normalizes the scheme/host, and only the scheme/host."""
     assert parse_url(url).url == expected_normalized_url
Ejemplo n.º 35
0
 def test_parse_url_invalid_IPv6(self):
     """An unterminated IPv6 literal raises LocationParseError."""
     pytest.raises(LocationParseError, parse_url, '[::1')
Ejemplo n.º 36
0
 def test_parse_url_normalization(self, url, expected_normalized_url):
     """parse_url normalizes the scheme/host, and only the scheme/host."""
     normalized = parse_url(url).url
     assert normalized == expected_normalized_url
Ejemplo n.º 37
0
 def test_request_uri(self, url, expected_request_uri):
     """request_uri of each parametrized URL matches the expectation."""
     actual = parse_url(url).request_uri
     assert actual == expected_request_uri
Ejemplo n.º 38
0
 def test_parse_and_normalize_url_paths(self, url, expected_url):
     """Parsing normalizes paths exactly as expected_url specifies."""
     parsed = parse_url(url)
     assert parsed == expected_url
     assert parsed.url == expected_url.url
Ejemplo n.º 39
0
	def __init__(self, username, password, url):
		"""Store credentials and normalise *url* to the REST API v2 root."""
		self.username = username
		self.password = password
		# Parse once to extract the host, then build the API base URL from it.
		host = parse_url(url).host
		self.url = parse_url('https://' + host + '/rest/api/2/')
Ejemplo n.º 40
0
 def test_parse_url_invalid_IPv6(self):
     """An unterminated IPv6 literal must be rejected."""
     pytest.raises(ValueError, parse_url, '[::1')
Ejemplo n.º 41
0
def get_url(url):
    """Thin wrapper around :func:`parse_url`."""
    return parse_url(url)
Ejemplo n.º 42
0
 def test_parse_url(self, url, expected_url):
     """Each parametrized URL parses to its expected Url tuple."""
     assert parse_url(url) == expected_url
Ejemplo n.º 43
0
def verify_host(url, allowed):
    """Return True when the URL's host is one of the *allowed* hosts."""
    host = parse_url(url).host
    return host in allowed
Ejemplo n.º 44
0
 def test_parse_url_negative_port(self):
     """A negative port number must raise LocationParseError."""
     pytest.raises(LocationParseError, parse_url, "https://www.google.com:-80/")
Ejemplo n.º 45
0
 def test_url_vulnerabilities(self, url, expected_url):
     """Known-hostile URLs either fail to parse or normalize safely."""
     if expected_url is not False:
         # A safe normalization is expected for this case.
         assert parse_url(url) == expected_url
     else:
         # False marks URLs that must be rejected outright.
         with pytest.raises(LocationParseError):
             parse_url(url)
Ejemplo n.º 46
0
 def test_netloc(self, url, expected_netloc):
     """netloc of each parametrized URL matches the expectation."""
     actual = parse_url(url).netloc
     assert actual == expected_netloc
Ejemplo n.º 47
0
	def __make_url(self, endpoint):
		"""Build the normalized absolute URL for *endpoint* under the base URL."""
		base = self.url
		full = "{0}://{1}{2}{3}".format(base.scheme, base.host, base.path, endpoint)
		# Re-parse so the returned string is the normalized .url form.
		return parse_url(full).url
Ejemplo n.º 48
0
 def test_parse_url(self):
     """Round-trip every URL from both expectation maps through parse_url."""
     combined = chain(self.parse_url_host_map.items(),
                      self.non_round_tripping_parse_url_host_map.items())
     for raw, expected in combined:
         self.assertEqual(parse_url(raw), expected)
Ejemplo n.º 49
0
 def test_parse_url(self, url, expected_url):
     """Parsed result equals the expected Url, and hostname aliases host."""
     parsed = parse_url(url)
     assert parsed == expected_url
     assert parsed.hostname == parsed.host == expected_url.host
Ejemplo n.º 50
0
 def test_parse_url_bytes_type_error_python_3(self):
     """On Python 3, byte-string input raises TypeError."""
     raw = b"https://www.google.com/"
     with pytest.raises(TypeError):
         parse_url(raw)
Ejemplo n.º 51
0
 def get_client(self, jwt_token=None):
     """Return a LocalClient configured with the live server's host and port."""
     client = LocalClient(jwt_token)
     server = parse_url(self.liveserver.url)
     client.host = server.host
     client.port = server.port
     return client