Example #1
0
    def _findReferences(self, tag, attrs):
        '''
        Extract URL references from a single parsed tag.
        
        @parameter tag: The tag name reported by the parser.
        @parameter attrs: A list of (attribute_name, attribute_value) tuples.
        @return: None. New findings are appended to self._parsed_URLs and
        self._tag_and_url.
        '''
        tag_lower = tag.lower()
        if tag_lower not in self._tagsContainingURLs:
            return

        for attr_name, attr_val in attrs:
            if attr_name.lower() not in self._urlAttrs:
                continue

            # Ignore empty values and pure fragment references like "#top"
            if not attr_val or attr_val.startswith('#'):
                continue

            joined = urlParser.urlJoin(self._baseUrl, attr_val)
            decoded = self._decode_URL(joined, self._encoding)
            url = urlParser.normalizeURL(decoded)

            if url not in self._parsed_URLs:
                self._parsed_URLs.append(url)
                self._tag_and_url.append((tag_lower, url))
                # Record at most one new URL per tag
                break
    def _regex_url_parse(self, httpResponse):
        '''
        Use regular expressions to find new URLs.
        
        @parameter httpResponse: The http response object that stores the
        response body and the URL.
        @return: None. The findings are stored in self._re_URLs.
        '''
        # Absolute http(s) URLs.
        url_regex = re.compile(
            r'((http|https)://([a-zA-Z0-9_:@\-\./]*?)/[^ \n\r\t"\'<>]*)')

        for match_tuple in url_regex.findall(httpResponse.getBody()):
            # _decode_URL raises w3afException whenever it fails to decode
            # a url, so simply skip the URLs that can't be decoded.
            try:
                decoded_url = self._decode_URL(match_tuple[0], self._encoding)
            except w3afException:
                pass
            else:
                self._re_URLs.append(decoded_url)

        #
        # Now detect some relative URL's ( also using regexs )
        #
        self._re_URLs.extend(self._find_relative(httpResponse))

        self._re_URLs = [urlParser.normalizeURL(i) for i in self._re_URLs]
        self._re_URLs = list(set(self._re_URLs))

    def _find_relative(self, httpResponse):
        '''
        Find relative URLs in the response body using a regular expression.
        
        @parameter httpResponse: The http response object that stores the
        response body and the URL.
        @return: A list of decoded relative URLs (joined against the
        response's domain path).
        '''
        res = []

        # TODO: This regex also matches //foo/bar.txt and
        # http://host.tld/foo/bar.txt ; those matches are filtered out
        # manually below.
        relative_regex = re.compile(
            r'((:?[/]{1,2}[A-Z0-9a-z%_\-~\.]+)+\.[A-Za-z0-9]{2,4}'
            r'(((\?)([a-zA-Z0-9]*=\w*)){1}((&)([a-zA-Z0-9]*=\w*))*)?)')
        # Matches protocol banners such as "HTTP/1.1"
        http_version_regex = re.compile(r'HTTP/\d\.\d')
        # Matches "PHP/5.2.4-2ubuntu5.7", "Apache/2.2.8" and "mod_python/3.3.1"
        server_version_regex = re.compile(r'.*?/\d\.\d\.\d')

        # Loop invariant: the base path all relative matches are joined to.
        domainPath = urlParser.getDomainPath(httpResponse.getURL())

        for match_tuple in relative_regex.findall(httpResponse.getBody()):

            match_string = match_tuple[0]

            #
            #   And now I filter out some of the common false positives
            #
            if match_string.startswith('//'):
                continue

            if match_string.startswith('://'):
                continue

            if http_version_regex.match(match_string):
                continue

            if server_version_regex.match(match_string):
                continue
            #
            #   Filter finished.
            #

            url = urlParser.urlJoin(domainPath, match_string)

            # _decode_URL raises w3afException on failure; be consistent
            # with the absolute-URL handling above and skip undecodable
            # URLs instead of letting the exception abort the whole parse.
            try:
                url = self._decode_URL(url, self._encoding)
            except w3afException:
                continue

            res.append(url)

        return res