def compareNormalizeUrl(urlA, urlB, raiseException=True):
        uvA = UrlValidator()
        uvB = UrlValidator()

        if not uvA.validate(urlA):
            if raiseException:
                raise Exception('Invalid urlA')
            else:
                return -1

        if not uvB.validate(urlB):
            if raiseException:
                raise Exception('Invalid urlB')
            else:
                return 1

        ucA = UrlCanonicalizer()
        ucB = UrlCanonicalizer()

        yourlA = ucA.canonicalizerValidator(uvA)
        yourlB = ucB.canonicalizerValidator(uvB)

        if yourlA < yourlB:
            return -1
        elif yourlA > yourlB:
            return 1
        else:
            return 0
Beispiel #2
0
    def __getNormalizedUrl(self):
        yourl = self.urls[:]
        ret = []
        for url in yourl:
            uv = UrlValidator()
            if uv.validate(url):
                uc = UrlCanonicalizer()
                ret.append(uc.canonicalizerValidator(uv))
            else:
                ret.append(None)

        return ret
    def canonicalizeUrl(self, url):
        uv = UrlValidator()
        if not uv.validate(url):
            raise Exception('invalid url')

        return self.canonicalizerValidator(uv)
Beispiel #4
0
 def setUp(self):
     self.urlValidator = UrlValidator()
Beispiel #5
0
 urls = []
 line = infile.readline()
 while len(line) > 0:
     # take out next line characters
     if line.endswith('\n'):
         line = line[:-1]
     urls.append(line)
     line = infile.readline()
 
 # filter out empty strings
 urls = filter(lambda s: s.strip(), urls)
 
 # process each url 
 for url in urls:
     # url valid
     uv = UrlValidator()
     isValid = uv.validate(url)
             
     # remove url in urls
     wo_url_in_urls = urls[:]
     wo_url_in_urls.remove(url)
     
     # initialize param
     normURL = None
     isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
     isNormUnique = None
     
     if isValid:
         uc = UrlCanonicalizer()
         normURL = uc.canonicalizerValidator(uv)
         isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)