Example #1
0
    def _findReferences(self, tag, attrs):
        '''
        Extract URL references from a single parsed tag.
        
        @parameter tag: The tag name reported by the parser.
        @parameter attrs: A list of (attribute_name, attribute_value) tuples.
        @return: None. New findings are appended to self._parsed_URLs and
        self._tag_and_url.
        '''
        tag_lower = tag.lower()
        if tag_lower not in self._tagsContainingURLs:
            return

        for attr_name, attr_val in attrs:
            if attr_name.lower() not in self._urlAttrs:
                continue

            # Ignore empty values and pure fragment references like "#top"
            if not attr_val or attr_val.startswith('#'):
                continue

            joined = urlParser.urlJoin(self._baseUrl, attr_val)
            decoded = self._decode_URL(joined, self._encoding)
            url = urlParser.normalizeURL(decoded)

            if url not in self._parsed_URLs:
                self._parsed_URLs.append(url)
                self._tag_and_url.append((tag_lower, url))
                # Record at most one new URL per tag
                break
    def _regex_url_parse(self, httpResponse):
        '''
        Use regular expressions to find new URLs.
        
        @parameter httpResponse: The http response object that stores the
        response body and the URL.
        @return: None. The findings are stored in self._re_URLs.
        '''
        # Absolute http(s) URLs.
        url_regex = re.compile(
            r'((http|https)://([a-zA-Z0-9_:@\-\./]*?)/[^ \n\r\t"\'<>]*)')

        for match_tuple in url_regex.findall(httpResponse.getBody()):
            # _decode_URL raises w3afException whenever it fails to decode
            # a url, so simply skip the URLs that can't be decoded.
            try:
                decoded_url = self._decode_URL(match_tuple[0], self._encoding)
            except w3afException:
                pass
            else:
                self._re_URLs.append(decoded_url)

        #
        # Now detect some relative URL's ( also using regexs )
        #
        self._re_URLs.extend(self._find_relative(httpResponse))

        self._re_URLs = [urlParser.normalizeURL(i) for i in self._re_URLs]
        self._re_URLs = list(set(self._re_URLs))

    def _find_relative(self, httpResponse):
        '''
        Find relative URLs in the response body using a regular expression.
        
        @parameter httpResponse: The http response object that stores the
        response body and the URL.
        @return: A list of decoded relative URLs (joined against the
        response's domain path).
        '''
        res = []

        # TODO: This regex also matches //foo/bar.txt and
        # http://host.tld/foo/bar.txt ; those matches are filtered out
        # manually below.
        relative_regex = re.compile(
            r'((:?[/]{1,2}[A-Z0-9a-z%_\-~\.]+)+\.[A-Za-z0-9]{2,4}'
            r'(((\?)([a-zA-Z0-9]*=\w*)){1}((&)([a-zA-Z0-9]*=\w*))*)?)')
        # Matches protocol banners such as "HTTP/1.1"
        http_version_regex = re.compile(r'HTTP/\d\.\d')
        # Matches "PHP/5.2.4-2ubuntu5.7", "Apache/2.2.8" and "mod_python/3.3.1"
        server_version_regex = re.compile(r'.*?/\d\.\d\.\d')

        # Loop invariant: the base path all relative matches are joined to.
        domainPath = urlParser.getDomainPath(httpResponse.getURL())

        for match_tuple in relative_regex.findall(httpResponse.getBody()):

            match_string = match_tuple[0]

            #
            #   And now I filter out some of the common false positives
            #
            if match_string.startswith('//'):
                continue

            if match_string.startswith('://'):
                continue

            if http_version_regex.match(match_string):
                continue

            if server_version_regex.match(match_string):
                continue
            #
            #   Filter finished.
            #

            url = urlParser.urlJoin(domainPath, match_string)

            # _decode_URL raises w3afException on failure; be consistent
            # with the absolute-URL handling above and skip undecodable
            # URLs instead of letting the exception abort the whole parse.
            try:
                url = self._decode_URL(url, self._encoding)
            except w3afException:
                continue

            res.append(url)

        return res