def rewrite_urls(origin_url, urls):
    """Yield rewritten, absolute-where-possible versions of ``urls``.

    This is a generator. For every non-empty entry of ``urls`` it:

    * removes embedded tab, line-feed and carriage-return characters;
    * passes the URL's scheme through ``rewrite_scheme`` (defined elsewhere
      in this module);
    * if ``origin_url`` carries a username and the URL points at the same
      hostname, rebuilds the netloc with ``assemble_netloc`` (defined
      elsewhere) from the origin's username/password and the URL's own
      host/port;
    * drops the fragment;
    * if the result has neither scheme nor netloc but does have a path or
      query, resolves it against ``origin_url`` with ``urljoin``;
    * replaces literal spaces with ``%20``.

    Entries that are ``None``/empty, or whose rewritten form is empty, are
    skipped.

    :param origin_url: URL of the page the links were found on.
    :param urls: iterable of URL strings (may contain ``None``/empty items).
    :returns: generator of rewritten URL strings.
    """
    origin_pack = urlparse.urlsplit(origin_url)

    for u in urls:
        # None/empty entries carry nothing to rewrite; skipping them here
        # also keeps None from ever reaching urlsplit().
        if not u:
            continue

        # kill breaks: tab, LF and CR are all noise inside a URL
        u = re.sub(r"[\n\r\t]", "", u)

        pack = urlparse.urlsplit(u)
        netloc, path, query = pack.netloc, pack.path, pack.query

        # try to rewrite scheme
        scheme = rewrite_scheme(pack.scheme)

        # rewrite netloc to include the origin's credentials, but only for
        # URLs that point at the same host as the origin
        if origin_pack.username and pack.hostname == origin_pack.hostname:
            netloc = assemble_netloc(
                origin_pack.username,
                origin_pack.password,
                pack.hostname,
                pack.port,
            )

        # reassemble into url (fragment intentionally dropped)
        new_u = urlparse.urlunsplit((scheme, netloc, path, query, None))

        # no scheme or netloc, it's a path on-site: resolve against origin
        if not scheme and not netloc and (path or query):
            path_query = urlparse.urlunsplit(('', '', path, query, ''))
            new_u = urlparse.urljoin(origin_url, path_query)

        # quote spaces
        new_u = new_u.replace(" ", "%20")
        if new_u:
            yield new_u
def get_referer(url):
    """Return ``url`` cut back to the directory of its path.

    The last path segment is removed and the query string and fragment are
    dropped; scheme and netloc are kept as they are, e.g.
    ``http://host/a/b/c.html?x=1#f`` becomes ``http://host/a/b``.

    :param url: URL string to derive the value from.
    :returns: the URL with its path replaced by that path's parent directory.
    """
    # URL paths always use "/" as separator, so apply POSIX rules explicitly
    # instead of os.path, whose behaviour depends on the host OS (on Windows
    # it would also treat "\" as a separator).
    import posixpath

    parts = urlparse.urlsplit(url)
    parent_path = posixpath.dirname(parts.path)
    return urlparse.urlunsplit((parts.scheme, parts.netloc, parent_path, None, None))