def replace(self, arg, charset='utf-8'):
    """Resolve ``arg`` against ``self.host`` and return a normalised URL.

    Side effect: ``self.host`` is overwritten with the result of joining
    the previous ``self.host`` and ``arg`` (with every ``"//"`` in ``arg``
    collapsed to ``"/"``).  The stored value is the joined URL, not the
    normalised one that is returned.

    The returned URL is built from the lower-cased host string with every
    ``"www."`` occurrence removed; its path is run through
    ``urlnorm.norm_path``, percent-quoted and lower-cased, and its query
    string is quoted with ``urllib.quote_plus``.

    ``charset`` is the encoding used when the lower-cased URL turns out to
    be a ``unicode`` object; characters that cannot be encoded are dropped.
    """
    joined = urlparse.urljoin(self.host, arg.replace("//", "/"))
    self.host = joined

    # NOTE(review): ``lower`` is expected to be provided at module level
    # (presumably ``string.lower``) -- confirm against the file's imports.
    cleaned = ''
    try:
        cleaned = lower(self.host.replace("www.", ""))
        if isinstance(cleaned, unicode):
            cleaned = cleaned.encode(charset, 'ignore')
    except UnicodeEncodeError:
        # Deliberately ignored; ``cleaned`` keeps whatever value it had
        # when the error was raised.  Nothing is logged here.
        pass

    scheme, netloc, raw_path, query, fragment = urlparse.urlsplit(cleaned)

    normalised_path = urlnorm.norm_path("http", raw_path)
    # '/' and '%' stay unescaped in the path.
    quoted_path = urllib.quote(normalised_path.encode('utf-8'), '/%')
    # ':', '&', '?', '/' and '=' stay unescaped in the query string.
    quoted_query = urllib.quote_plus(query.encode('utf8'), ':&?/=')

    return urlparse.urlunsplit(
        (scheme, netloc, lower(quoted_path), quoted_query, fragment)
    )
def norm_url(url):
    """Normalise ``url`` with urlnorm on a best-effort basis.

    The input is first converted with ``uni`` and encoded as UTF-8; that
    conversion happens outside the ``try`` block, so any error it raises
    propagates to the caller.

    Returns:
        * the result of ``urlnorm.norm`` when it succeeds;
        * the result of ``urlnorm.norm_path('', url)`` when ``urlnorm.norm``
          raises ``urlnorm.InvalidUrl``;
        * the UTF-8 encoded input unchanged when either call raises
          ``UnicodeDecodeError``;
        * ``None`` (after printing the traceback) when ``urlnorm.norm``
          raises any other ``Exception``.

    Errors other than ``UnicodeDecodeError`` raised by the ``norm_path``
    fallback are not handled here and propagate to the caller.
    """
    url = uni(url).encode('utf-8')
    try:
        return urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        # Happens when the URL is relative. Call path normalization directly.
        try:
            return urlnorm.norm_path('', url)
        except UnicodeDecodeError:
            return url
    except UnicodeDecodeError:
        # Work around for a bug in urlnorm on unicode URLs.
        return url
    except Exception:
        # Best-effort: report the failure and signal it with None.  Catching
        # Exception instead of using a bare ``except:`` lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # swallowed.
        traceback.print_exc()
        return None
def test_norm_path(bad, good):
    """Check that normalising the path ``bad`` under the "http" scheme
    yields ``good`` (compared after conversion with ``_unicode``)."""
    assert urlnorm.norm_path("http", bad) == _unicode(good)
def normurl(url):
    """Return ``url`` normalised by urlnorm.

    ``urlnorm.norm`` is tried first; if it rejects the input with
    ``urlnorm.InvalidUrl`` (presumably because the URL is relative --
    confirm against urlnorm), the input is normalised as a bare path with
    an empty scheme instead.
    """
    try:
        normalised = urlnorm.norm(url)
    except urlnorm.InvalidUrl:
        normalised = urlnorm.norm_path('', url)
    return normalised