def rewrite_urls(origin_url, urls):
    """Yield each URL from ``urls`` rewritten relative to ``origin_url``.

    For every non-empty entry the generator:

    * strips embedded newlines and tabs,
    * passes the scheme through ``rewrite_scheme``,
    * rebuilds the netloc with the origin's credentials (via
      ``assemble_netloc``) when the origin URL carries a username and the
      entry points at the same hostname,
    * drops the fragment,
    * resolves entries that end up with neither scheme nor netloc against
      ``origin_url``,
    * replaces spaces with ``%20``.

    Args:
        origin_url: URL the entries are resolved against; also the source
            of the credentials that may be copied into the result.
        urls: Iterable of URL strings. Falsy entries (``None``, ``""``)
            are skipped.

    Yields:
        The rewritten URL for each entry whose result is non-empty.
    """
    origin_pack = urlparse.urlsplit(origin_url)

    for u in urls:
        # Skip empty entries up front so that None never reaches urlsplit().
        if not u:
            continue

        # kill breaks
        # NOTE(review): carriage returns are left in place -- confirm intended.
        u = re.sub("(\n|\t)", "", u)

        pack = urlparse.urlsplit(u)
        scheme, netloc, path, query, _fragment = pack

        # try to rewrite scheme
        scheme = rewrite_scheme(pack.scheme)

        # rewrite netloc to include credentials
        if origin_pack.username and pack.hostname == origin_pack.hostname:
            netloc = assemble_netloc(origin_pack.username,
                                     origin_pack.password,
                                     pack.hostname, pack.port)

        if not scheme and not netloc and (path or query):
            # no scheme or netloc, it's a path on-site
            path_query = urlparse.urlunsplit(('', '', path, query, ''))
            new_u = urlparse.urljoin(origin_url, path_query)
        else:
            # reassemble into url (the fragment is deliberately dropped)
            new_u = urlparse.urlunsplit((scheme, netloc, path, query, None))

        # quote spaces
        new_u = new_u.replace(" ", "%20")
        if new_u:
            yield new_u
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    """Report an HTTP redirect by raising instead of handling it here.

    When the ``SILENT_REDIRECT`` environment variable is set to a non-empty
    value, the call is delegated unchanged to
    ``urllib.FancyURLopener.redirect_internal`` and its result is returned.

    Otherwise the redirect target is read from the ``location`` header
    (falling back to ``uri``), resolved against ``self.fetcher.url`` and
    reported to the caller by raising ``ChangedUrlWarning``.

    Returns:
        The base-class result in silent mode, or ``None`` when the response
        names no redirect target.

    Raises:
        ChangedUrlWarning: carrying the absolute redirect URL.
    """
    if os.environ.get("SILENT_REDIRECT"):
        return urllib.FancyURLopener.redirect_internal(
            self, url, fp, errcode, errmsg, headers, data)

    if 'location' in headers:
        newurl = headers['location']
    elif 'uri' in headers:
        newurl = headers['uri']
    else:
        # No redirect target in the response: return None, as the stdlib
        # implementation does, instead of hitting an unbound `newurl`.
        return None

    # Resolve relative targets against the fetcher's URL rather than the
    # `url` argument passed to this handler.
    newurl = urlparse.urljoin(self.fetcher.url, newurl)
    raise ChangedUrlWarning(newurl)