Ejemplo n.º 1
0
 def compareNormalizeUrl(urlA, urlB, raiseException=True):
     """Three-way compare two URLs by their canonical forms.

     Each URL is validated with its own UrlValidator (urlA first, then
     urlB) and then canonicalized through
     UrlCanonicalizer.canonicalizerValidator before being compared.

     Args:
         urlA: first URL to compare.
         urlB: second URL to compare.
         raiseException: when truthy, an invalid URL raises; when falsy,
             an invalid URL is reported through the return value instead.

     Returns:
         -1 if the canonical form of urlA orders before that of urlB,
         1 if it orders after, 0 otherwise. With raiseException falsy,
         -1 is returned when urlA is invalid and 1 when urlB is invalid
         (urlA is checked first).

     Raises:
         Exception: 'Invalid urlA' / 'Invalid urlB' when the respective
             URL fails validation and raiseException is truthy.
     """
     validatorA, validatorB = UrlValidator(), UrlValidator()

     # (validator, url, error message, result when not raising)
     checks = (
         (validatorA, urlA, 'Invalid urlA', -1),
         (validatorB, urlB, 'Invalid urlB', 1),
     )
     for validator, url, message, fallback in checks:
         if validator.validate(url):
             continue
         if raiseException:
             raise Exception(message)
         return fallback

     canonicalizerA, canonicalizerB = UrlCanonicalizer(), UrlCanonicalizer()

     # The canonicalizers consume the validators, not the raw URL strings.
     normalizedA = canonicalizerA.canonicalizerValidator(validatorA)
     normalizedB = canonicalizerB.canonicalizerValidator(validatorB)

     if normalizedA < normalizedB:
         return -1
     if normalizedA > normalizedB:
         return 1
     return 0
Ejemplo n.º 2
0
    def __getNormalizedUrl(self):
        """Return the canonical form of every entry in self.urls.

        Returns:
            A list with one item per entry of self.urls, in the same
            order: the result of UrlCanonicalizer.canonicalizerValidator
            for entries that pass UrlValidator.validate, and None for
            entries that do not.
        """
        normalized = []

        # Walk a snapshot of self.urls, as the original implementation did.
        for candidate in self.urls[:]:
            validator = UrlValidator()
            if not validator.validate(candidate):
                normalized.append(None)
                continue
            # The canonicalizer is fed the validator, not the raw string.
            normalized.append(
                UrlCanonicalizer().canonicalizerValidator(validator))

        return normalized
Ejemplo n.º 3
0
 def __getNormalizedUrl(self):
     """Build the list of canonical URLs matching self.urls.

     Returns:
         A list with one item per entry of self.urls, in order. An item
         is the value returned by UrlCanonicalizer.canonicalizerValidator
         when the entry passes UrlValidator.validate, otherwise None.
     """
     results = []

     # Iterate over a copy of self.urls rather than the attribute itself.
     for source in self.urls[:]:
         checker = UrlValidator()
         # A canonicalizer is only created for entries that validate.
         entry = (UrlCanonicalizer().canonicalizerValidator(checker)
                  if checker.validate(source) else None)
         results.append(entry)

     return results
Ejemplo n.º 4
0
 def test_removeWWWdot(self):
     # canonicalizeUrl must turn the sample URL into its expected form,
     # which has no "www." prefix and a collapsed path.
     cases = [
         ('http://www.google.com//path//..///path////////////',
          'http://google.com/path/'),
     ]

     for source, wanted in cases:
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 5
0
 def test_addtrailingslash(self):
     # Each expected canonical form ends with a trailing slash even though
     # the corresponding input path does not.
     sources = [
         'http://google.com/path',
         'http://*****:*****@en.wIkipediA.org:0/wiki/Unit_testing/%4f%4F#Language-',
     ]
     wanted_forms = [
         'http://google.com/path/',
         'http://en.wikipedia.org/wiki/Unit_testing/OO/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 6
0
 def test_removeUserPassword(self):
     # The expected canonical forms carry no user/password part before
     # the host name.
     sources = [
         'hunlan:[email protected]:80/hunlan%40gmail%2ecom',
         'http://*****:*****@en.wIkipediA.org:0/wiki/Unit_testing/%4f%4F#Language-',
     ]
     wanted_forms = [
         'google.com/[email protected]/',
         'http://en.wikipedia.org/wiki/Unit_testing/OO/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 7
0
 def test_lowercaseHostName(self):
     # Mixed-case host names in the inputs are expected to come back in
     # lower case.
     sources = [
         'www.GoOgLE.com',
         'http://en.wIkipediA.org/wiki/Unit_testing#Language-',
     ]
     wanted_forms = [
         'google.com/',
         'http://en.wikipedia.org/wiki/Unit_testing/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 8
0
 def test_removeDupSlashes(self):
     # Inputs contain runs of repeated slashes and ".." segments; the
     # expected canonical forms contain single slashes only.
     sources = [
         'http://google.com//path//..///path////////////',
         'http://*****:*****@en.wIkipediA.org:0//wiki/Unit_testing/%2e%2e#Language-',
     ]
     wanted_forms = [
         'http://google.com/path/',
         'http://en.wikipedia.org/wiki/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 9
0
 def test_wikiexample(self):
     # Two URLs that differ only in their "#fragment" are expected to
     # canonicalize to the same fragment-free form.
     sources = [
         'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
         'http://en.wikipedia.org/wiki/Unit_testing#Language-',
     ]
     wanted_forms = [
         'http://en.wikipedia.org/wiki/Unit_testing/',
         'http://en.wikipedia.org/wiki/Unit_testing/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 10
0
 def test_decodePercentEncoding(self):
     # Percent-escaped characters in the input paths are expected to
     # appear decoded in the canonical forms (e.g. %43%53%45 -> CSE).
     sources = [
         'www.GoOgLE.com/hunlan%40gmail%2ecom',
         'cs.washington.edu/%43%53%45%34%30%33',
         'http://en.wIkipediA.org/wiki/Unit_testing/%4f%4F#Language-',
     ]
     wanted_forms = [
         'google.com/[email protected]/',
         'cs.washington.edu/CSE403/',
         'http://en.wikipedia.org/wiki/Unit_testing/OO/',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
Ejemplo n.º 11
0
    def compareNormalizeUrl(urlA, urlB, raiseException=True):
        """Order two URLs according to their canonical forms.

        Both URLs are validated (urlA first) and then canonicalized via
        UrlCanonicalizer.canonicalizerValidator; the canonical values are
        what is actually compared.

        NOTE(review): there is no ``self`` parameter although the
        function sits at method indentation -- presumably it is exposed
        as a static method by the enclosing class; confirm there.

        Args:
            urlA: first URL to compare.
            urlB: second URL to compare.
            raiseException: truthy to raise on an invalid URL, falsy to
                signal it through the return value.

        Returns:
            -1, 1 or 0 when the canonical form of urlA is respectively
            less than, greater than, or neither relative to that of urlB.
            When raiseException is falsy, an invalid urlA yields -1 and
            an invalid urlB yields 1.

        Raises:
            Exception: with message 'Invalid urlA' or 'Invalid urlB' when
                that URL fails validation and raiseException is truthy.
        """
        checkerA = UrlValidator()
        checkerB = UrlValidator()

        validA = checkerA.validate(urlA)
        if not validA and raiseException:
            raise Exception('Invalid urlA')
        if not validA:
            return -1

        # urlB is only validated once urlA has been accepted.
        validB = checkerB.validate(urlB)
        if not validB and raiseException:
            raise Exception('Invalid urlB')
        if not validB:
            return 1

        normalizerA = UrlCanonicalizer()
        normalizerB = UrlCanonicalizer()

        # Canonical forms are derived from the validators, not the inputs.
        canonicalA = normalizerA.canonicalizerValidator(checkerA)
        canonicalB = normalizerB.canonicalizerValidator(checkerB)

        if canonicalA < canonicalB:
            return -1
        return 1 if canonicalA > canonicalB else 0
Ejemplo n.º 12
0
 def test_sortAndUseAndSignForQuery(self):
     # Query strings written with ';' and '&' separators in arbitrary
     # order are expected back '&'-joined and sorted, with keys such as
     # "A" and "B" in lower case.
     sources = [
         'www.nba.com?a=0;A=1;a=d',
         'http://google.com//path//..///path////////////?b=2;a=1',
         'http://*****:*****@en.wIkipediA.org:0//wiki/Unit_testing/%2e%2e?a=0;c=1&B=2#Language-',
     ]
     wanted_forms = [
         'nba.com/?a=0&a=1&a=d',
         'http://google.com/path/?a=1&b=2',
         'http://en.wikipedia.org/wiki/?a=0&b=2&c=1',
     ]

     for source, wanted in zip(sources, wanted_forms):
         result = UrlCanonicalizer().canonicalizeUrl(source)
         # Failure report: the input, the expected and the actual value.
         report = '\n'.join([
             'fail on url: ' + source,
             'expected: ' + wanted,
             'actual  : ' + result,
         ])
         self.assertEqual(wanted, result, report)
          
         
         
         
         
         
         
Ejemplo n.º 13
0
 # For every entry of `urls` (defined outside this fragment), print a short
 # report: the source URL, whether it validates, its canonical form, and
 # whether it is unique among the other entries. Python 2 print statements.
 for url in urls:
     # Validate the URL; the validator object is reused below to build the
     # canonical form.
     uv = UrlValidator()
     isValid = uv.validate(url)

     # Build the comparison set: a copy of urls without the current entry.
     # NOTE(review): assuming urls is a list, remove() drops only the first
     # equal element, so an exact duplicate of url stays in the copy.
     wo_url_in_urls = urls[:]
     wo_url_in_urls.remove(url)

     # Defaults for the report; the None values are kept when the URL is
     # invalid and are printed as 'None' / 'N/A' below.
     normURL = None
     isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
     isNormUnique = None

     # Canonical form and canonical uniqueness are only computed for URLs
     # that passed validation.
     if isValid:
         uc = UrlCanonicalizer()
         normURL = uc.canonicalizerValidator(uv)
         # Third argument is presumably a "raise on invalid URL" flag --
         # TODO confirm against UrlComparator.isNormalizeUnique.
         isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)

     # Emit the report, followed by an empty line as a separator.
     print 'Source: ' + url
     print 'Valid: ' + str(isValid)
     print 'Canonical: ' + ('None' if normURL == None else normURL)
     print 'Source unique: ' + str(isSrcUnique)
     print 'Canonicalized URL unique: ' + ('N/A' if isNormUnique == None else str(isNormUnique))

     print ''

 # Clean up: close infile, which is opened outside this fragment.
 infile.close()