def __init__(self, crawl_infrastructure_plugins, w3af_core,
             max_discovery_time):
    '''
    :param crawl_infrastructure_plugins: Instances of crawl_infrastructure
                                         plugins in a list
    :param w3af_core: The w3af core that we'll use for status reporting
    :param max_discovery_time: The max time (in seconds) to use for the
                               discovery phase
    '''
    super(crawl_infrastructure, self).__init__(crawl_infrastructure_plugins,
                                               w3af_core,
                                               thread_name='CrawlInfra')
    self._max_discovery_time = int(max_discovery_time)

    # For filtering fuzzable requests found by plugins:
    self._variant_db = VariantDB()
    self._already_seen_urls = ScalableBloomFilter()

    self._disabled_plugins = set()
    self._running = True
    self._report_max_time = True
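
# A hypothetical sketch, not w3af's actual code: how the two filters
# initialized above (VariantDB and ScalableBloomFilter) could be combined
# to decide whether a fuzzable request found by a plugin is worth
# processing. The method name and the get_uri() accessor are assumptions;
# need_more_variants() and append() are the VariantDB calls exercised by
# the tests further below.
def _is_new_fuzzable_request(self, fuzzable_request):
    uri = fuzzable_request.get_uri()  # assumed accessor

    # Drop URIs that were already seen verbatim
    if uri.url_string in self._already_seen_urls:
        return False
    self._already_seen_urls.add(uri.url_string)

    # Cap the number of URIs that only differ in parameter values
    if not self._variant_db.need_more_variants(uri):
        return False
    self._variant_db.append(uri)

    return True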
class web_spider(CrawlPlugin):
    '''
    Crawl the web application.

    :author: Andres Riancho ([email protected])
    '''
    NOT_404 = set([http_constants.UNAUTHORIZED,
                   http_constants.FORBIDDEN])

    def __init__(self):
        CrawlPlugin.__init__(self)

        # Internal variables
        self._compiled_ignore_re = None
        self._compiled_follow_re = None
        self._broken_links = DiskSet()
        self._first_run = True
        self._known_variants = VariantDB()
        self._already_filled_form = ScalableBloomFilter()

        # User configured variables
        self._ignore_regex = ''
        self._follow_regex = '.*'
        self._only_forward = False
        self._compile_re()

    def crawl(self, fuzzable_req):
        '''
        Searches for links in the HTML.

        :param fuzzable_req: A fuzzable_req instance that contains
                             (among other things) the URL to test.
        '''
        if self._first_run:
            # I have to set some variables, in order to be able to code
            # the "only_forward" feature
            self._first_run = False

            self._target_urls = [i.get_domain_path()
                                 for i in cf.cf.get('targets')]

            # The following line triggered lots of bugs when the "stop"
            # button was pressed and the core did this:
            # "cf.cf.save('targets', [])"
            #self._target_domain = cf.cf.get('targets')[0].get_domain()
            # Changing it to something awful but bug-free.
            targets = cf.cf.get('targets')
            if not targets:
                return
            else:
                self._target_domain = targets[0].get_domain()

        #
        # If it is a form, then smart_fill the parameters to send
        # something that makes sense and will allow us to cover more code.
        #
        if isinstance(fuzzable_req, HTTPPostDataRequest):
            if fuzzable_req.get_url() in self._already_filled_form:
                return
            fuzzable_req = self._fill_form(fuzzable_req)

        # Send the HTTP request
        resp = self._uri_opener.send_mutant(fuzzable_req,
                                            follow_redir=False)

        # Nothing to do here...
        if resp.get_code() == 401:
            return

        fuzz_req_list = self._create_fuzzable_requests(resp,
                                                       request=fuzzable_req,
                                                       add_self=False)
        for fr in fuzz_req_list:
            self.output_queue.put(fr)

        self._extract_links_and_verify(resp, fuzzable_req)

    def _urls_to_verify_generator(self, resp, fuzzable_req):
        '''
        :param resp: HTTP response object
        :param fuzzable_req: The HTTP request that generated the response
        '''
        #
        # Note: I WANT to follow links that are in the 404 page.
        #

        # Modified when I added the PDFParser
        # I had to add this x OR y stuff, just because I don't want the
        # SGML parser to analyze an image file, it's useless and consumes
        # CPU power.
        if resp.is_text_or_html() or resp.is_pdf() or resp.is_swf():
            original_url = resp.get_redir_uri()
            try:
                doc_parser = parser_cache.dpc.get_document_parser_for(resp)
            except w3afException, w3:
                om.out.debug('Failed to find a suitable document parser. '
                             'Exception "%s"' % w3)
            else:
                # Note:
                # - With parsed_refs I'm 100% sure that it's really
                #   something in the HTML that the developer intended
                #   to add.
                #
                # - The re_refs are the result of regular expressions,
                #   which in some cases are just false positives.
                parsed_refs, re_refs = doc_parser.get_references()

                # I also want to analyze all directories, if the URL I
                # just fetched is:
                #   http://localhost/a/b/c/f00.php I want to GET:
                #   http://localhost/a/b/c/
                #   http://localhost/a/b/
                #   http://localhost/a/
                #   http://localhost/
                # And analyze the responses...
                dirs = resp.get_url().get_directories()
                only_re_refs = set(re_refs) - set(dirs + parsed_refs)
                all_refs = itertools.chain(dirs, parsed_refs, re_refs)

                for ref in unique_justseen(sorted(all_refs)):
                    # I don't want w3af sending requests to 3rd parties!
                    if ref.get_domain() != self._target_domain:
                        continue

                    # Filter the URLs according to the configured regexes
                    urlstr = ref.url_string
                    if not self._compiled_follow_re.match(urlstr) or \
                       self._compiled_ignore_re.match(urlstr):
                        continue

                    if self._only_forward:
                        if not self._is_forward(ref):
                            continue

                    # Work with the parsed references and report broken
                    # links. Then work with the regex references and DO
                    # NOT report broken links
                    if self._need_more_variants(ref):
                        self._known_variants.append(ref)
                        possibly_broken = ref in only_re_refs
                        yield ref, fuzzable_req, original_url, possibly_broken
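
# The web_spider class above relies on helpers that are not shown. A
# minimal sketch of what the _compile_re() and _is_forward() methods
# could look like, inferred only from how they are used here (assumes
# the re module is imported); the never-matching fallback for an empty
# ignore regex is an assumption, not the real implementation.
def _compile_re(self):
    # re.compile('') would match every URL, so an empty ignore regex is
    # replaced with a pattern that can never match anything
    ignore = self._ignore_regex or '(?!x)x'
    self._compiled_ignore_re = re.compile(ignore)
    self._compiled_follow_re = re.compile(self._follow_regex)

def _is_forward(self, reference):
    # Only follow URLs that live at or below one of the configured
    # target directories (the "only_forward" feature)
    for target_url in self._target_urls:
        if reference.url_string.startswith(target_url.url_string):
            return True
    return False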
class TestVariantDB(unittest.TestCase):

    def setUp(self):
        create_temp_dir()
        self.vdb = VariantDB()

    def test_db_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s'
        _max = 5

        for i in xrange(_max):
            url = URL(url_fmt % i)
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        self.assertFalse(
            self.vdb.need_more_variants(URL(url_fmt % (_max + 1,))))

    def test_db_int_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=1'
        _max = 5

        for i in xrange(_max):
            url = URL(url_fmt % i)
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        self.assertFalse(
            self.vdb.need_more_variants(URL(url_fmt % (_max + 1,))))

    def test_db_int_int_var(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        _max = 5

        for i in xrange(_max):
            url = URL(url_fmt % (i, i))
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        self.assertFalse(
            self.vdb.need_more_variants(URL(url_fmt % (_max + 1, _max + 1))))

    def test_db_int_str(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        _max = 5

        for i in xrange(_max):
            url = URL(url_fmt % (i, 'abc' * i))
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        self.assertFalse(
            self.vdb.need_more_variants(
                URL(url_fmt % (_max + 1, 'abc' * (_max + 1)))))

    def test_db_int_str_then_int_int(self):
        url_fmt = 'http://w3af.org/foo.htm?id=%s&bar=%s'
        _max = 5

        # Add (int, str)
        for i in xrange(_max):
            url = URL(url_fmt % (i, 'abc' * i))
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        # Please note that in this case I'm asking for (int, int) and I
        # added (int, str) before
        self.assertTrue(
            self.vdb.need_more_variants(URL(url_fmt % (_max + 1, _max + 1))))

        # Add (int, int)
        for i in xrange(_max):
            url = URL(url_fmt % (i, i))
            self.assertTrue(self.vdb.need_more_variants(url))
            self.vdb.append(url)

        self.assertFalse(
            self.vdb.need_more_variants(URL(url_fmt % (_max + 1, _max + 1))))

    def test_clean_reference_simple(self):
        self.assertEqual(self.vdb._clean_reference(URL('http://w3af.org/')),
                         u'http://w3af.org/')

    def test_clean_reference_file(self):
        self.assertEqual(
            self.vdb._clean_reference(URL('http://w3af.org/index.php')),
            u'http://w3af.org/index.php')

    def test_clean_reference_directory_file(self):
        self.assertEqual(
            self.vdb._clean_reference(URL('http://w3af.org/foo/index.php')),
            u'http://w3af.org/foo/index.php')

    def test_clean_reference_directory_file_int(self):
        self.assertEqual(
            self.vdb._clean_reference(
                URL('http://w3af.org/foo/index.php?id=2')),
            u'http://w3af.org/foo/index.php?id=number')

    def test_clean_reference_int(self):
        self.assertEqual(
            self.vdb._clean_reference(URL('http://w3af.org/index.php?id=2')),
            u'http://w3af.org/index.php?id=number')

    def test_clean_reference_int_str(self):
        self.assertEqual(
            self.vdb._clean_reference(
                URL('http://w3af.org/index.php?id=2&foo=bar')),
            u'http://w3af.org/index.php?id=number&foo=string')

    def test_clean_reference_int_str_empty(self):
        self.assertEqual(
            self.vdb._clean_reference(
                URL('http://w3af.org/index.php?id=2&foo=bar&spam=')),
            u'http://w3af.org/index.php?id=number&foo=string&spam=string')
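
# The _clean_reference() tests above pin down a normalization rule:
# numeric query string values collapse to 'number' and all other values
# (including empty ones) collapse to 'string'. Below is a standalone
# sketch of that rule using the Python 2 stdlib instead of w3af's URL
# class; it illustrates the behavior the tests expect and is not the
# real implementation.
import urlparse

def clean_reference(url_string):
    parsed = urlparse.urlparse(url_string)
    if not parsed.query:
        return url_string

    cleaned = []
    for pair in parsed.query.split('&'):
        key, _, value = pair.partition('=')
        # '2' -> 'number'; 'bar' and '' -> 'string'
        token = 'number' if value.isdigit() else 'string'
        cleaned.append('%s=%s' % (key, token))

    return '%s://%s%s?%s' % (parsed.scheme, parsed.netloc, parsed.path,
                             '&'.join(cleaned))

# For example, clean_reference('http://w3af.org/index.php?id=2&foo=bar')
# returns 'http://w3af.org/index.php?id=number&foo=string', matching
# test_clean_reference_int_str.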