def test_sourceUnique(self):
    """Exercise UrlComparator.isSourceUnique with and without an exact match.

    The URL is expected to be reported unique against a list that does not
    contain the identical string, and not unique against a list that does.
    """
    source = 'www.google.com'
    without_source = ['google.com', 'http://google.com']
    with_source = ['www.google.com', 'something.net']

    # Similar-looking entries must not count as a source-level duplicate.
    self.assertTrue(UrlComparator.isSourceUnique(source, without_source))
    # The identical string is present, so the source is not unique.
    self.assertFalse(UrlComparator.isSourceUnique(source, with_source))
def test_sourceUnique(self):
    """Verify isSourceUnique for a list lacking, then containing, the URL."""
    # NOTE(review): an identical test_sourceUnique is defined immediately
    # before this one; if both live in the same class, this definition
    # replaces the earlier one -- confirm whether the duplicate is intended.
    candidate = 'www.google.com'
    is_source_unique = UrlComparator.isSourceUnique

    # No entry equals the candidate string exactly -> expected unique.
    self.assertTrue(
        is_source_unique(candidate, ['google.com', 'http://google.com']))
    # The first entry equals the candidate string -> expected not unique.
    self.assertFalse(
        is_source_unique(candidate, ['www.google.com', 'something.net']))
# For every non-blank entry in `urls`, print whether it validates, its
# canonical form (when valid), and whether it is unique among the other
# entries, both as a raw source string and after normalization.

# Filter out empty / whitespace-only strings. A list comprehension (instead
# of filter()) always yields a real list, which the slice copy below needs;
# on Python 3 filter() returns an iterator that cannot be sliced.
urls = [s for s in urls if s.strip()]

# Process each url.
for url in urls:
    # Validate the url. The same validator instance is handed to the
    # canonicalizer further down, so a fresh one is created per url.
    uv = UrlValidator()
    isValid = uv.validate(url)

    # Copy of the input with one occurrence of the current url removed, so
    # a duplicate elsewhere in the input is still present for comparison.
    wo_url_in_urls = urls[:]
    wo_url_in_urls.remove(url)

    # Defaults reported when the url is not valid.
    normURL = None
    isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
    isNormUnique = None

    if isValid:
        # Canonicalization and the normalized comparison only run for urls
        # that passed validation.
        uc = UrlCanonicalizer()
        normURL = uc.canonicalizerValidator(uv)
        isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)

    # Single-argument print(...) produces the same output whether print is
    # the Python 2 statement or the Python 3 function.
    print('Source: ' + url)
    print('Valid: ' + str(isValid))
    print('Canonical: ' + ('None' if normURL is None else normURL))
    print('Source unique: ' + str(isSrcUnique))
    print('Canonicalized URL unique: ' + ('N/A' if isNormUnique is None else str(isNormUnique)))
    print('')