Example #1
 def _extract_links_and_verify(self, resp, fuzzable_req):
     #
     # Note: I WANT to follow links that are in the 404 page.
     #
     
     # Modified when I added the pdfParser
     # I had to add this x OR y stuff just because I don't want
     # the SGML parser to analyze an image file; it's useless and
     # consumes CPU power.
     if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
         originalURL = resp.getRedirURI()
         try:
             doc_parser = dpCache.dpc.getDocumentParserFor(resp)
         except w3afException, w3:
             om.out.debug('Failed to find a suitable document parser. '
                          'Exception "%s"' % w3)
         else:
             # Note:
             # - With parsed_refs I'm 100% sure that it's really
             # something in the HTML that the developer intended to add.
             #
             # - The re_refs are the result of regular expressions,
             # which in some cases are just false positives.
             
             parsed_refs, re_refs = doc_parser.getReferences()
             
             # I also want to analyze all directories: if the URL I just
             # fetched is
             # http://localhost/a/b/c/f00.php I want to GET:
             # http://localhost/a/b/c/
             # http://localhost/a/b/
             # http://localhost/a/
             # http://localhost/
             # And analyze the responses...
             dirs = resp.getURL().getDirectories()
             only_re_refs = set(re_refs) - set(dirs + parsed_refs)
             
             for ref in unique_justseen(
                     sorted(itertools.chain(dirs, parsed_refs, re_refs))):
                 
                 # I don't want w3af sending requests to 3rd parties!
                 if ref.getDomain() != self._target_domain:
                     continue
                 
                 # Filter the URLs according to the configured regexes
                 urlstr = ref.url_string
                 if not self._compiled_follow_re.match(urlstr) or \
                     self._compiled_ignore_re.match(urlstr):
                     continue
                 
                 # Work with the parsed references and report broken
                 # links. Then work with the regex references and DO NOT
                 # report broken links
                 if self._need_more_variants(ref):
                     self._known_variants.append(ref)
                     possibly_broken = ref in only_re_refs
                     args = (ref, fuzzable_req, originalURL,
                              possibly_broken)
                     self._run_async(meth=self._verify_reference, args=args)
             self._join()
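
All of these snippets lean on unique_justseen to drop duplicates from the sorted stream of references. The helper itself is not shown on this page; a minimal sketch following the standard itertools recipe of the same name (written for Python 2, to match the code above) would be:

    from itertools import groupby, imap
    from operator import itemgetter

    def unique_justseen(iterable, key=None):
        # Yield unique elements, preserving order; only the element
        # just seen is remembered, so duplicates must already be
        # adjacent. That is why the callers sort the chained
        # references before passing them in.
        return imap(next, imap(itemgetter(1), groupby(iterable, key)))

With sorted() applied first, equal references become adjacent, so each distinct reference is handed to _verify_reference exactly once.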
Example #2
 def end(self):
     '''
     Called when the process ends, prints out the list of broken links.
     '''
     if len(self._broken_links):
         
         om.out.information('The following is a list of broken links that '
                            'were found by the webSpider plugin:')
         for broken, where in unique_justseen(self._broken_links.ordered_iter()):
             om.out.information('- %s [ referenced from: %s ]' %
                                (broken, where))
Example #3
    def end(self):
        """
        Called when the process ends, prints out the list of broken links.
        """
        if len(self._broken_links):

            om.out.information("The following is a list of broken links that " "were found by the web_spider plugin:")
            for broken, where in unique_justseen(self._broken_links.ordered_iter()):
                om.out.information("- %s [ referenced from: %s ]" % (broken, where))

        self._broken_links.cleanup()
Example #4
    def end(self):
        '''
        Called when the process ends, prints out the list of broken links.
        '''
        if len(self._broken_links):

            om.out.information('The following is a list of broken links that '
                               'were found by the web_spider plugin:')
            for broken, where in unique_justseen(
                    self._broken_links.ordered_iter()):
                om.out.information('- %s [ referenced from: %s ]' %
                                   (broken, where))

        self._broken_links.cleanup()
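
Across Examples #2-#4, _broken_links only needs to answer len(), yield ordered (broken, where) pairs, and clean up after itself. Its real backing store in w3af is not visible in these snippets; a hypothetical in-memory stand-in that satisfies this usage could look like:

    class BrokenLinks(object):
        # Hypothetical stand-in: collects (broken_url, referenced_from)
        # pairs and yields them sorted, so unique_justseen() in end()
        # can drop adjacent duplicates.
        def __init__(self):
            self._pairs = []

        def append(self, broken, where):
            self._pairs.append((broken, where))

        def __len__(self):
            return len(self._pairs)

        def ordered_iter(self):
            return iter(sorted(self._pairs))

        def cleanup(self):
            del self._pairs[:]

The ordering matters because unique_justseen() only removes duplicates that sit next to each other.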
Example #5
    def _urls_to_verify_generator(self, resp, fuzzable_req):
        '''
        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        '''
        #
        # Note: I WANT to follow links that are in the 404 page.
        #

        # Modified when I added the PDFParser
        # I had to add this x OR y stuff just because I don't want
        # the SGML parser to analyze an image file; it's useless and
        # consumes CPU power.
        if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
            original_url = resp.get_redir_uri()
            try:
                doc_parser = parser_cache.dpc.get_document_parser_for(resp)
            except w3afException, w3:
                om.out.debug('Failed to find a suitable document parser. '
                             'Exception "%s"' % w3)
            else:
                # Note:
                # - With parsed_refs I'm 100% sure that it's really
                # something in the HTML that the developer intended to add.
                #
                # - The re_refs are the result of regular expressions,
                # which in some cases are just false positives.

                parsed_refs, re_refs = doc_parser.get_references()

                # I also want to analyze all directories: if the URL I just
                # fetched is
                # http://localhost/a/b/c/f00.php I want to GET:
                # http://localhost/a/b/c/
                # http://localhost/a/b/
                # http://localhost/a/
                # http://localhost/
                # And analyze the responses...
                dirs = resp.get_url().get_directories()
                only_re_refs = set(re_refs) - set(dirs + parsed_refs)

                all_refs = itertools.chain(dirs, parsed_refs, re_refs)

                for ref in unique_justseen(sorted(all_refs)):

                    # I don't want w3af sending requests to 3rd parties!
                    if ref.get_domain() != self._target_domain:
                        continue

                    # Filter the URLs according to the configured regexes
                    urlstr = ref.url_string
                    if not self._compiled_follow_re.match(urlstr) or \
                            self._compiled_ignore_re.match(urlstr):
                        continue

                    if self._only_forward:
                        if not self._is_forward(ref):
                            continue

                    # Work with the parsed references and report broken
                    # links. Then work with the regex references and DO NOT
                    # report broken links
                    if self._need_more_variants(ref):
                        self._known_variants.append(ref)
                        possibly_broken = ref in only_re_refs
                        yield ref, fuzzable_req, original_url, possibly_broken
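
get_directories() belongs to w3af's URL class and is not shown here. A rough standard-library sketch of the parent-directory expansion that the comment describes (the function name and return format are illustrative, not w3af's actual implementation):

    from urlparse import urlsplit, urlunsplit

    def get_directories(url):
        # http://localhost/a/b/c/f00.php expands to .../a/b/c/,
        # .../a/b/, .../a/ and http://localhost/
        scheme, netloc, path, _, _ = urlsplit(url)
        segments = path.split('/')[:-1]  # drop the file name
        dirs = []
        while segments:
            dir_path = '/'.join(segments) + '/'
            dirs.append(urlunsplit((scheme, netloc, dir_path, '', '')))
            segments.pop()
        return dirs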
Example #6
    def _urls_to_verify_generator(self, resp, fuzzable_req):
        """
        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        """
        #
        # Note: I WANT to follow links that are in the 404 page.
        #

        # Modified when I added the PDFParser
        # I had to add this x OR y stuff just because I don't want
        # the SGML parser to analyze an image file; it's useless and
        # consumes CPU power.
        if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
            original_url = resp.get_redir_uri()
            try:
                doc_parser = parser_cache.dpc.get_document_parser_for(resp)
            except w3afException, w3:
                om.out.debug("Failed to find a suitable document parser. " 'Exception "%s"' % w3)
            else:
                # Note:
                # - With parsed_refs I'm 100% sure that it's really
                # something in the HTML that the developer intended to add.
                #
                # - The re_refs are the result of regular expressions,
                # which in some cases are just false positives.

                parsed_refs, re_refs = doc_parser.get_references()

                # I also want to analyze all directories: if the URL I just
                # fetched is
                # http://localhost/a/b/c/f00.php I want to GET:
                # http://localhost/a/b/c/
                # http://localhost/a/b/
                # http://localhost/a/
                # http://localhost/
                # And analyze the responses...
                dirs = resp.get_url().get_directories()
                only_re_refs = set(re_refs) - set(dirs + parsed_refs)

                all_refs = itertools.chain(dirs, parsed_refs, re_refs)

                for ref in unique_justseen(sorted(all_refs)):

                    # I don't want w3af sending requests to 3rd parties!
                    if ref.get_domain() != self._target_domain:
                        continue

                    # Filter the URLs according to the configured regexes
                    urlstr = ref.url_string
                    if not self._compiled_follow_re.match(urlstr) or self._compiled_ignore_re.match(urlstr):
                        continue

                    if self._only_forward:
                        if not self._is_forward(ref):
                            continue

                    # Work with the parsed references and report broken
                    # links. Then work with the regex references and DO NOT
                    # report broken links
                    if self._need_more_variants(ref):
                        self._known_variants.append(ref)
                        possibly_broken = ref in only_re_refs
                        yield ref, fuzzable_req, original_url, possibly_broken
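
The generator variants (Examples #5 and #6) separate URL discovery from verification: the caller decides how to consume the yielded (ref, fuzzable_req, original_url, possibly_broken) tuples. Example #1 fanned each tuple out with _run_async(); a hedged sketch of an equivalent driver using a standard-library thread pool (the function name and pool size are assumptions, not w3af's actual API):

    from multiprocessing.dummy import Pool  # thread-based Pool

    def verify_all(spider, resp, fuzzable_req, workers=10):
        # Illustrative driver: run the plugin's _verify_reference()
        # over every tuple the generator yields, then wait for all
        # workers to finish (the moral equivalent of _join()).
        pool = Pool(workers)
        try:
            pool.map(lambda args: spider._verify_reference(*args),
                     spider._urls_to_verify_generator(resp, fuzzable_req))
        finally:
            pool.close()
            pool.join()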