def compareNormalizeUrl(urlA, urlB, raiseException=True):
    """Compare two URLs by their canonical (normalized) forms.

    Each URL is validated first. For an invalid URL: raises Exception when
    raiseException is true; otherwise returns -1 (invalid urlA) or
    1 (invalid urlB). For two valid URLs, returns -1, 0, or 1 depending on
    how their canonical forms order relative to each other.
    """
    validatorA = UrlValidator()
    validatorB = UrlValidator()
    if not validatorA.validate(urlA):
        if raiseException:
            raise Exception('Invalid urlA')
        return -1
    if not validatorB.validate(urlB):
        if raiseException:
            raise Exception('Invalid urlB')
        return 1
    canonicalA = UrlCanonicalizer().canonicalizerValidator(validatorA)
    canonicalB = UrlCanonicalizer().canonicalizerValidator(validatorB)
    # Three-way comparison: (a > b) - (a < b) yields -1, 0, or 1.
    return (canonicalA > canonicalB) - (canonicalA < canonicalB)
def __getNormalizedUrl(self):
    """Return the canonical form of every URL in self.urls.

    The result list is positionally aligned with self.urls; entries that
    fail validation are represented by None.
    """
    def _canonical(candidate):
        # One validator per URL: the canonicalizer consumes the validator
        # that accepted the URL.
        validator = UrlValidator()
        if not validator.validate(candidate):
            return None
        return UrlCanonicalizer().canonicalizerValidator(validator)

    return [_canonical(candidate) for candidate in self.urls]
def canonicalizeUrl(self, url):
    """Validate *url* and return its canonical form.

    Raises Exception if the URL does not pass validation.
    """
    validator = UrlValidator()
    if validator.validate(url):
        # Hand the accepting validator to the canonicalizer.
        return self.canonicalizerValidator(validator)
    raise Exception('invalid url')
def setUp(self):
    # Fresh validator instance for each test case.
    self.urlValidator = UrlValidator()
# Read every line from the already-open input file into `urls`,
# stripping the trailing newline character from each line.
urls = []
line = infile.readline()
while len(line) > 0:
    # take out next line characters
    if line.endswith('\n'):
        line = line[:-1]
    urls.append(line)
    line = infile.readline()

# Filter out empty strings.
# FIX: the original used filter(), which on Python 3 returns a lazy
# iterator — the urls[:] copy and .remove() calls below require a real
# list. A list comprehension behaves identically on Python 2 and 3.
urls = [u for u in urls if u.strip()]

# process each url
for url in urls:
    # url valid
    uv = UrlValidator()
    isValid = uv.validate(url)
    # remove url in urls: all other urls, with the first occurrence of
    # the current url taken out.
    wo_url_in_urls = urls[:]
    wo_url_in_urls.remove(url)
    # initialize per-url results; normalization data stays None for
    # invalid urls.
    normURL = None
    isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
    isNormUnique = None
    if isValid:
        uc = UrlCanonicalizer()
        normURL = uc.canonicalizerValidator(uv)
        isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)