def rewrite_urls(origin_url, urls):
    """Yield rewritten versions of *urls*, resolved against *origin_url*.

    Falsy entries (``None``, ``""``) are skipped. For every other entry:

    * newline and tab characters are removed;
    * the scheme is passed through ``rewrite_scheme``;
    * if the origin URL carries a username and the entry's hostname equals
      the origin's hostname, the netloc is rebuilt with ``assemble_netloc``
      from the origin's username/password and the entry's host/port;
    * the fragment is dropped;
    * an entry with neither scheme nor netloc, but with a path or query, is
      joined onto ``origin_url``;
    * spaces are replaced with ``%20``.

    Results that end up empty are not yielded.
    """
    origin_pack = urlparse.urlsplit(origin_url)
    for u in urls:
        # Nothing to rewrite; also keeps None away from urlsplit, which
        # expects a string.
        if not u:
            continue

        # kill breaks
        u = re.sub("(\n|\t)", "", u)

        pack = urlparse.urlsplit(u)
        netloc, path, query = pack.netloc, pack.path, pack.query

        # try to rewrite scheme
        scheme = rewrite_scheme(pack.scheme)

        # rewrite netloc to include credentials
        if origin_pack.username and pack.hostname == origin_pack.hostname:
            netloc = assemble_netloc(origin_pack.username,
                                     origin_pack.password,
                                     pack.hostname, pack.port)

        # reassemble into url (fragment intentionally left out)
        new_u = urlparse.urlunsplit((scheme, netloc, path, query, None))

        # no scheme or netloc, it's a path on-site
        if not scheme and not netloc and (path or query):
            path_query = urlparse.urlunsplit(('', '', path, query, ''))
            new_u = urlparse.urljoin(origin_url, path_query)

        # quote spaces
        new_u = new_u.replace(" ", "%20")

        if new_u:
            yield new_u
def url_to_filename(url):
    """Derive a local filename from *url*.

    When the environment variable ``ORIG_FILENAMES`` equals ``"1"`` and the
    URL path has a non-empty basename, that basename is returned unchanged.

    Otherwise the scheme, netloc, extension-less path and query (empty parts
    omitted) are joined with underscores; every character outside
    ``[a-zA-Z0-9]`` becomes ``_``, runs of underscores are collapsed, one
    trailing underscore is stripped, and the path's original extension is
    appended.
    """
    (scheme, netloc, path, query, _) = urlparse.urlsplit(url)

    basename = os.path.basename(path)
    if basename and os.environ.get("ORIG_FILENAMES") == "1":
        return basename

    stem, ext = os.path.splitext(path)
    pieces = [part for part in (scheme, netloc, stem, query) if part]

    name = "_".join(pieces)
    # Apply the clean-up passes in order: sanitize, collapse, trim.
    for pattern, replacement in (("[^a-zA-Z0-9]", "_"),
                                 ("_{2,}", "_"),
                                 ("_$", "")):
        name = re.sub(pattern, replacement, name)

    return name + ext
def get_scheme(url):
    """Return the scheme component of *url* as parsed by ``urlsplit``."""
    return urlparse.urlsplit(url).scheme
def get_referer(url):
    """Return *url* reduced to scheme, netloc and the directory of its path.

    The path is replaced by ``os.path.dirname`` of the original path; the
    query and fragment are dropped.
    """
    parts = urlparse.urlsplit(url)
    parent_dir = os.path.dirname(parts.path)
    return urlparse.urlunsplit(
        (parts.scheme, parts.netloc, parent_dir, None, None))
def get_hostname(url):
    """Return the ``hostname`` attribute of *url* as parsed by ``urlsplit``."""
    return urlparse.urlsplit(url).hostname