def compareNormalizeUrl(urlA, urlB, raiseException=True):
        """Three-way compare two URLs by their canonicalized forms.

        Each URL is validated with its own UrlValidator; urlA is checked
        before urlB. The validated results are then passed through
        UrlCanonicalizer.canonicalizerValidator and compared with ``<``/``>``.

        Args:
            urlA: first URL to compare.
            urlB: second URL to compare.
            raiseException: when truthy, an invalid URL raises; when falsy,
                an invalid urlA yields -1 and an invalid urlB yields 1.

        Returns:
            -1, 0 or 1 depending on whether the canonical form of urlA is
            less than, equal to, or greater than that of urlB.

        Raises:
            Exception: 'Invalid urlA' / 'Invalid urlB' when the respective
                URL fails validation and raiseException is truthy.
        """
        validatorA = UrlValidator()
        validatorB = UrlValidator()

        # urlA is validated first, so it wins when both URLs are invalid.
        if not validatorA.validate(urlA):
            if not raiseException:
                return -1
            raise Exception('Invalid urlA')

        if not validatorB.validate(urlB):
            if not raiseException:
                return 1
            raise Exception('Invalid urlB')

        canonicalizerA = UrlCanonicalizer()
        canonicalizerB = UrlCanonicalizer()

        # The canonicalizers consume the validators (which hold the
        # validated URLs), not the raw strings.
        normalizedA = canonicalizerA.canonicalizerValidator(validatorA)
        normalizedB = canonicalizerB.canonicalizerValidator(validatorB)

        if normalizedA < normalizedB:
            return -1
        if normalizedA > normalizedB:
            return 1
        return 0
 def compareNormalizeUrl(urlA, urlB, raiseException=True):
     """Three-way compare two URLs by their canonicalized forms.

     Each URL is validated with its own UrlValidator; urlA is checked
     before urlB. The validated results are then passed through
     UrlCanonicalizer.canonicalizerValidator and compared with ``<``/``>``.

     Args:
         urlA: first URL to compare.
         urlB: second URL to compare.
         raiseException: when truthy, an invalid URL raises; when falsy,
             an invalid urlA yields -1 and an invalid urlB yields 1.

     Returns:
         -1, 0 or 1 depending on whether the canonical form of urlA is
         less than, equal to, or greater than that of urlB.

     Raises:
         Exception: 'Invalid urlA' / 'Invalid urlB' when the respective
             URL fails validation and raiseException is truthy.
     """
     validatorA = UrlValidator()
     validatorB = UrlValidator()

     # urlA is validated first, so it wins when both URLs are invalid.
     if not validatorA.validate(urlA):
         if not raiseException:
             return -1
         raise Exception('Invalid urlA')

     if not validatorB.validate(urlB):
         if not raiseException:
             return 1
         raise Exception('Invalid urlB')

     canonicalizerA = UrlCanonicalizer()
     canonicalizerB = UrlCanonicalizer()

     # The canonicalizers consume the validators (which hold the
     # validated URLs), not the raw strings.
     normalizedA = canonicalizerA.canonicalizerValidator(validatorA)
     normalizedB = canonicalizerB.canonicalizerValidator(validatorB)

     if normalizedA < normalizedB:
         return -1
     if normalizedA > normalizedB:
         return 1
     return 0
Beispiel #3
0
    def __getNormalizedUrl(self):
        """Return the canonical form of every entry in ``self.urls``.

        The result is a new list aligned index-for-index with ``self.urls``:
        each valid URL is replaced by the output of
        UrlCanonicalizer.canonicalizerValidator, each invalid one by None.
        """
        normalized = []
        # Iterate over a snapshot so the source list is never touched.
        for candidate in self.urls[:]:
            validator = UrlValidator()
            if not validator.validate(candidate):
                # Keep a placeholder so positions still line up with self.urls.
                normalized.append(None)
                continue
            canonicalizer = UrlCanonicalizer()
            normalized.append(canonicalizer.canonicalizerValidator(validator))

        return normalized
 def __getNormalizedUrl(self):
     """Return the canonical form of every entry in ``self.urls``.

     The result is a new list aligned index-for-index with ``self.urls``:
     each valid URL is replaced by the output of
     UrlCanonicalizer.canonicalizerValidator, each invalid one by None.
     """
     normalized = []
     # Iterate over a snapshot so the source list is never touched.
     for candidate in self.urls[:]:
         validator = UrlValidator()
         if not validator.validate(candidate):
             # Keep a placeholder so positions still line up with self.urls.
             normalized.append(None)
             continue
         canonicalizer = UrlCanonicalizer()
         normalized.append(canonicalizer.canonicalizerValidator(validator))

     return normalized
    def canonicalizeUrl(self, url):
        """Validate ``url`` and return its canonical form.

        Args:
            url: the URL to canonicalize.

        Returns:
            Whatever ``self.canonicalizerValidator`` produces for the
            validator that accepted ``url``.

        Raises:
            Exception: 'invalid url' when UrlValidator rejects ``url``.
        """
        validator = UrlValidator()
        accepted = validator.validate(url)
        if accepted:
            # Canonicalization works from the validator, not the raw string.
            return self.canonicalizerValidator(validator)

        raise Exception('invalid url')
Beispiel #6
0
class UrlValidatorTest(TestCase):
    """Checks UrlValidator.validate against accepted and rejected URLs.

    Tests are grouped by the URL component they exercise (scheme,
    credentials, domain name, query, fragment).
    """

    def setUp(self):
        # A fresh validator for every test method.
        self.urlValidator = UrlValidator()

    def _assert_each(self, check, candidates, describe=True):
        """Apply ``check`` to the validation result of every candidate URL.

        ``check`` is ``self.assertTrue`` or ``self.assertFalse``. When
        ``describe`` is true the failing URL is included in the assertion
        message; otherwise the assertion is made without a message.
        """
        for candidate in candidates:
            outcome = self.urlValidator.validate(candidate)
            if describe:
                check(outcome, 'fail at url = ' + candidate)
            else:
                check(outcome)

    def test_wikiexample(self):
        """URLs with fragments taken from a Wikipedia article are valid."""
        wiki_urls = [
            'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
            'http://en.wikipedia.org/wiki/Unit_testing#Language-',
        ]
        self._assert_each(self.assertTrue, wiki_urls, describe=False)

    def test_illegalinput(self):
        """validate is expected to raise AssertionError for None and an int."""
        with self.assertRaises(AssertionError):
            self.urlValidator.validate(None)

        with self.assertRaises(AssertionError):
            self.urlValidator.validate(123)

    def test_emptystring(self):
        """The empty string is not a valid URL."""
        self.assertFalse(self.urlValidator.validate(''))

    def test_correct_incorrect_scheme(self):
        """Scheme handling: known schemes or none at all pass, typos fail."""
        accepted = [
            'http://www.google.com',
            'ftp://www.google.com',
            'www.google.com',
        ]
        self._assert_each(self.assertTrue, accepted)

        rejected = [
            'htp://www.google.com',
            '://www.google.com',
            'http:/www.google.com',
            'http//www.google.com',
        ]
        self._assert_each(self.assertFalse, rejected)

    def test_correct_incorrect_usernamepassword(self):
        """Username/password section of the URL."""
        # NOTE(review): the same '*****:*****' literal appears in both the
        # accepted and the rejected list below, so this test cannot pass as
        # written. The credentials look masked -- restore the real values
        # from the original test data.
        accepted = [
            'http://*****:*****@www.google.com',
            'http://www.google.com',
        ]
        self._assert_each(self.assertTrue, accepted)

        rejected = [
            'http://[email protected]',
            'http://*****:*****@www.google.com',
            'http://@www.google.com',
            'http://@@www.google.com',
        ]
        self._assert_each(self.assertFalse, rejected)

    def test_correct_incorrect_domainname(self):
        """Domain name section, plus malformed percent-escapes in the path."""
        accepted = [
            'http://google.com',
            'http://cs.washington.edu/path',
            'http://555.com',
        ]
        self._assert_each(self.assertTrue, accepted)

        rejected = [
            'http://.com',
            'http://*****:*****@nba.com',
            'http://www.google.com/images%',
            'http://www.google.com/%2x/',
        ]
        self._assert_each(self.assertFalse, rejected, describe=False)

    def test_correct_incorrect_query(self):
        """Query string section of the URL."""
        accepted = [
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#frag',
            'http://www.google.com:80/path/',
            'http://www.google.com:80/path/?',
            'http://www.google.com:80/path?',
        ]
        self._assert_each(self.assertTrue, accepted)

        rejected = [
            'http://www.google.com?nba',
            'http://www.google.com/??',
            'http://www.google.com/?cmm = cmm',
            'http://www.google.com/?key==value',
            'http://www.google.com/?1a=1b',
        ]
        self._assert_each(self.assertFalse, rejected, describe=False)

    def test_correct_incorrect_fragment(self):
        """Fragment section of the URL."""
        accepted = [
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool#fragment',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#_-_',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#',
            'http://www.google.com:80/path/',
        ]
        self._assert_each(self.assertTrue, accepted)

        rejected = [
            'http://www.google.com?nba#wrong fragment',
            'http://www.google.com/##',
        ]
        self._assert_each(self.assertFalse, rejected, describe=False)
Beispiel #7
0
class UrlValidatorTest(TestCase):
    """Checks UrlValidator.validate against accepted and rejected URLs."""

    def setUp(self):
        """Create the validator shared by the assertions of one test."""
        self.urlValidator = UrlValidator()

    def test_wikiexample(self):
        """Wikipedia article URLs with fragments are expected to validate."""
        wiki_urls = (
            'http://en.wikipedia.org/wiki/Unit_testing#Unit_testing_limitations',
            'http://en.wikipedia.org/wiki/Unit_testing#Language-',
        )
        for candidate in wiki_urls:
            self.assertTrue(self.urlValidator.validate(candidate))

    def test_illegalinput(self):
        """validate is expected to raise AssertionError for None and an int."""
        with self.assertRaises(AssertionError):
            self.urlValidator.validate(None)

        with self.assertRaises(AssertionError):
            self.urlValidator.validate(123)

    def test_emptystring(self):
        """The empty string is not a valid URL."""
        self.assertFalse(self.urlValidator.validate(''))

    def test_correct_incorrect_scheme(self):
        """Scheme handling: known schemes or none at all pass, typos fail."""
        accepted = (
            'http://www.google.com',
            'ftp://www.google.com',
            'www.google.com',
        )
        for candidate in accepted:
            verdict = self.urlValidator.validate(candidate)
            self.assertTrue(verdict, 'fail at url = ' + candidate)

        rejected = (
            'htp://www.google.com',
            '://www.google.com',
            'http:/www.google.com',
            'http//www.google.com',
        )
        for candidate in rejected:
            verdict = self.urlValidator.validate(candidate)
            self.assertFalse(verdict, 'fail at url = ' + candidate)

    def test_correct_incorrect_usernamepassword(self):
        """Username/password section of the URL."""
        # NOTE(review): the same '*****:*****' literal appears in both the
        # accepted and the rejected tuple below, so this test cannot pass as
        # written. The credentials look masked -- restore the real values
        # from the original test data.
        accepted = (
            'http://*****:*****@www.google.com',
            'http://www.google.com',
        )
        for candidate in accepted:
            verdict = self.urlValidator.validate(candidate)
            self.assertTrue(verdict, 'fail at url = ' + candidate)

        rejected = (
            'http://[email protected]',
            'http://*****:*****@www.google.com',
            'http://@www.google.com',
            'http://@@www.google.com',
        )
        for candidate in rejected:
            verdict = self.urlValidator.validate(candidate)
            self.assertFalse(verdict, 'fail at url = ' + candidate)

    def test_correct_incorrect_domainname(self):
        """Domain name section, plus malformed percent-escapes in the path."""
        accepted = (
            'http://google.com',
            'http://cs.washington.edu/path',
            'http://555.com',
        )
        for candidate in accepted:
            verdict = self.urlValidator.validate(candidate)
            self.assertTrue(verdict, 'fail at url = ' + candidate)

        rejected = (
            'http://.com',
            'http://*****:*****@nba.com',
            'http://www.google.com/images%',
            'http://www.google.com/%2x/',
        )
        for candidate in rejected:
            self.assertFalse(self.urlValidator.validate(candidate))

    def test_correct_incorrect_query(self):
        """Query string section of the URL."""
        accepted = (
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#frag',
            'http://www.google.com:80/path/',
            'http://www.google.com:80/path/?',
            'http://www.google.com:80/path?',
        )
        for candidate in accepted:
            verdict = self.urlValidator.validate(candidate)
            self.assertTrue(verdict, 'fail at url = ' + candidate)

        rejected = (
            'http://www.google.com?nba',
            'http://www.google.com/??',
            'http://www.google.com/?cmm = cmm',
            'http://www.google.com/?key==value',
            'http://www.google.com/?1a=1b',
        )
        for candidate in rejected:
            self.assertFalse(self.urlValidator.validate(candidate))

    def test_correct_incorrect_fragment(self):
        """Fragment section of the URL."""
        accepted = (
            'http://127.0.0.1:8000/url//nba/videos///?nba=cool#fragment',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#_-_',
            'http://127.0.0.1:8000/%2b?key=val1&key2=val2;key3=3#',
            'http://www.google.com:80/path/',
        )
        for candidate in accepted:
            verdict = self.urlValidator.validate(candidate)
            self.assertTrue(verdict, 'fail at url = ' + candidate)

        rejected = (
            'http://www.google.com?nba#wrong fragment',
            'http://www.google.com/##',
        )
        for candidate in rejected:
            self.assertFalse(self.urlValidator.validate(candidate))
Beispiel #8
0
 # Read the rest of infile into urls, one entry per line.
 line = infile.readline()
 while line:
     # Strip the trailing newline so it does not become part of the URL.
     if line.endswith('\n'):
         line = line[:-1]
     urls.append(line)
     line = infile.readline()
 
 # Drop blank / whitespace-only entries. This must be a real list rather
 # than filter(): on Python 3 filter() returns a lazy iterator, and the
 # loop below slices the collection (urls[:]), which would raise TypeError.
 urls = [s for s in urls if s.strip()]
 
 # Process each url.
 for url in urls:
     # Validate the raw url; uv is reused below for canonicalization.
     uv = UrlValidator()
     isValid = uv.validate(url)
 
     # Every other entry: a copy of urls with one occurrence of url removed.
     wo_url_in_urls = urls[:]
     wo_url_in_urls.remove(url)
 
     # Defaults for an invalid url: no normalized form, no normalized
     # uniqueness verdict.
     normURL = None
     isSrcUnique = UrlComparator.isSourceUnique(url, wo_url_in_urls)
     isNormUnique = None
 
     if isValid:
         uc = UrlCanonicalizer()
         normURL = uc.canonicalizerValidator(uv)
         isNormUnique = UrlComparator.isNormalizeUnique(url, wo_url_in_urls, False)