def _setup_404_detection(self):
    #
    # NOTE: I need to perform this test here in order to avoid some weird
    # thread locking that happens when the webspider calls is_404, and
    # because I want to initialize the is_404 database in a controlled
    # try/except block.
    #
    from w3af.core.controllers.core_helpers.fingerprint_404 import is_404

    for url in cf.cf.get('targets'):
        try:
            response = self._w3af_core.uri_opener.GET(url, cache=True)
        except ScanMustStopByUserRequest:
            raise
        except Exception, e:
            msg = ('Failed to send HTTP request to the configured target'
                   ' URL "%s", the original exception was: "%s" (%s).')
            args = (url, e, e.__class__.__name__)
            raise ScanMustStopException(msg % args)

        try:
            is_404(response)
        except ScanMustStopByUserRequest:
            raise
        except Exception, e:
            msg = ('Failed to initialize the 404 detection using HTTP'
                   ' response from "%s", the original exception was: "%s"'
                   ' (%s).')
            args = (url, e, e.__class__.__name__)
            raise ScanMustStopException(msg % args)
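A minimal sketch of the call pattern this warm-up enables: once the fingerprint database is initialized, plugin code can test responses cheaply. `handle_response` and `process` are illustrative names, not part of w3af:

from w3af.core.controllers.core_helpers.fingerprint_404 import is_404

def handle_response(response, process):
    # Skip responses that the 404 fingerprint database flags as "not found";
    # `process` is a hypothetical callback with the real plugin logic
    if not is_404(response):
        process(response)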
def get_document_parser_for(self, http_response):
    """
    Get a document parser for http_response

    This parses the http_response in a pool worker. This method has
    two features:
        * We can kill the worker if the parser is taking too long
        * We can have different parsers

    :param http_response: The http response instance
    :return: An instance of DocumentParser
    """
    # Start the worker processes if needed
    self.start_workers()

    apply_args = (process_document_parser,
                  http_response,
                  self.DEBUG)

    # Push the task to the workers
    try:
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)
    except RuntimeError, rte:
        # We get here when the pebble pool management thread dies and
        # suddenly starts answering all calls with:
        #
        #   RuntimeError('Unexpected error within the Pool')
        #
        # The scan needs to stop because we can't parse any more
        # HTTP responses, which is a very critical part of the process
        msg = str(rte)
        raise ScanMustStopException(msg)
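For context, a hedged sketch of how the `future` scheduled above might be consumed. Per pebble's documented behaviour, a task that exceeds its timeout resolves the future with a TimeoutError when result() is called; the import path below assumes the concurrent.futures backport that pebble uses on Python 2:

from concurrent.futures import TimeoutError

try:
    # apply_with_return_error returns either the parser or an error
    # wrapper; workers exceeding PARSER_TIMEOUT are killed by pebble
    parser_output = future.result()
except TimeoutError:
    # The worker ran longer than PARSER_TIMEOUT and was terminated
    parser_output = None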
def analyze_state():
    # There might be errors that make us stop the process
    if self._error_stopped:
        msg = 'Multiple exceptions found while sending HTTP requests.'
        raise ScanMustStopException(msg)

    if self._user_stopped:
        msg = 'The user stopped the scan.'
        raise ScanMustStopByUserRequest(msg)
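A hedged sketch of the intended call pattern: code that sends HTTP requests in a loop can call analyze_state() before each unit of work, so a stop flag set by another thread is honoured promptly. `worker_loop`, `work_queue` and `send` are illustrative names only:

def worker_loop(work_queue, analyze_state, send):
    for task in work_queue:
        # Raises ScanMustStopException / ScanMustStopByUserRequest as
        # soon as another thread has flagged the scan for shutdown
        analyze_state()
        send(task)  # hypothetical: performs the actual HTTP request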
def get_tags_by_filter(self, http_response, tags, yield_text=False):
    """
    Return Tag instances for the tags which match the `tags` filter.
    All the parsing and lxml work is done in another process, and the
    Tag instances are sent to the main process (the one calling this
    method) through a pipe.

    Some things to note:
        * Not all responses can be parsed, so I need to call
          DocumentParser and handle exceptions

        * The parser selected by DocumentParser might not have tags,
          and it might not have get_tags_by_filter. In this case just
          return an empty list

        * Just like get_document_parser_for we have a timeout in place;
          when we hit the timeout we just return an empty list. This is
          not the best thing to do, but it makes the plugin code easier
          to write (plugins would ignore this anyway)

    :param tags: The filter
    :param yield_text: Should we yield the tag text?
    :return: A list of Tag instances as defined in sgml.py

    :see: SGMLParser.get_tags_by_filter
    """
    # Start the worker processes if needed
    self.start_workers()

    filename = write_http_response_to_temp_file(http_response)

    apply_args = (process_get_tags_by_filter,
                  filename,
                  tags,
                  yield_text,
                  self.DEBUG)

    #
    # Push the task to the workers
    #
    try:
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)
    except RuntimeError, rte:
        # Remove the temp file used to send data to the process
        remove_file_if_exists(filename)

        # We get here when the pebble pool management thread dies and
        # suddenly starts answering all calls with:
        #
        #   RuntimeError('Unexpected error within the Pool')
        #
        # The scan needs to stop because we can't parse any more
        # HTTP responses, which is a very critical part of the process
        msg = str(rte)
        raise ScanMustStopException(msg)
def verify_target_server_up(self):
    """
    It is more common than expected for users to configure a target
    which is offline, is not a web server, etc. So we're going to
    verify all that before even starting our work, and provide a nice
    error message so that users can change their configuration if
    needed.

    Note that we send MAX_ERROR_COUNT tests to the remote end in order
    to trigger any errors in the remote end and have the extended URL
    library error handler return errors.

    :raises: A friendly exception with lots of details of what could
             have happened.
    """
    sent_requests = 0

    msg = ('The remote web server is not answering our HTTP requests,'
           ' multiple errors have been found while trying to GET a response'
           ' from the server.\n\n'
           'In most cases this means that the configured target is'
           ' incorrect, the port is closed, there is a firewall blocking'
           ' our packets or there is no HTTP daemon listening on that'
           ' port.\n\n'
           'Please verify your target configuration and try again. The'
           ' tested targets were:\n\n'
           ' %s\n')

    targets = cf.cf.get('targets')

    while sent_requests < MAX_ERROR_COUNT * 1.5:
        for url in targets:
            try:
                self._w3af_core.uri_opener.GET(url, cache=False)
            except ScanMustStopByUserRequest:
                # Not a real error, the user stopped the scan
                raise
            except Exception, e:
                dbg = 'Exception found during verify_target_server_up: "%s"'
                om.out.debug(dbg % e)

                target_list = '\n'.join(' - %s\n' % url for url in targets)
                raise ScanMustStopException(msg % target_list)
            else:
                sent_requests += 1
def _setup_404_detection(self):
    #
    # NOTE: I need to perform this test here in order to avoid some weird
    # thread locking that happens when the webspider calls is_404, and
    # because I want to initialize the is_404 database in a controlled
    # try/except block.
    #
    from w3af.core.controllers.core_helpers.fingerprint_404 import is_404

    for url in cf.cf.get('targets'):
        try:
            response = self._w3af_core.uri_opener.GET(url, cache=True)
            is_404(response)
        except ScanMustStopByUserRequest:
            raise
        except Exception, e:
            msg = ('Failed to initialize the 404 detection, original'
                   ' exception was: "%s".')
            raise ScanMustStopException(msg % e)
def test_teardown_with_must_stop_exception(self):
    w3af_core = w3afCore()

    xss_instance = xss()
    xss_instance.set_url_opener(w3af_core.uri_opener)
    xss_instance.set_worker_pool(w3af_core.worker_pool)

    audit_plugins = [xss_instance]

    audit_consumer = audit(audit_plugins, w3af_core)
    audit_consumer.start()

    url = 'http://w3af.org/?id=1'
    httpretty.register_uri(httpretty.GET,
                           url,
                           body='hello world',
                           content_type='application/html')

    url = URL(url)
    fr = FuzzableRequest(url)

    # This will trigger a few HTTP requests to the target URL which will
    # also initialize all the xss plugin internals to be able to run
    # end() later.
    audit_consumer.in_queue_put(fr)
    kb.kb.add_fuzzable_request(fr)

    # Now that xss.audit() was called, we want to simulate network errors
    # that will put the uri opener in a state where it always answers with
    # ScanMustStopException
    w3af_core.uri_opener._stop_exception = ScanMustStopException('mock')

    # And now we just call terminate(), which injects the poison pill and
    # calls teardown(), which should call xss.end(), which should try to
    # send HTTP requests, which will raise a ScanMustStopException
    with patch('w3af.core.controllers.core_helpers.consumers.audit.om.out') as om_mock:
        audit_consumer.terminate()

        msg = ('Spent 0.00 seconds running xss.end() until a scan must'
               ' stop exception was raised.')
        self.assertIn(call.debug(msg), om_mock.mock_calls)
def store_in_cache(request, response):
    # Create the http response object
    resp = HTTPResponse.from_httplib_resp(response,
                                          original_url=request.url_object)
    resp.set_id(response.id)
    resp.set_alias(gen_hash(request))

    hi = HistoryItem()
    hi.request = request
    hi.response = resp

    # Now save them
    try:
        hi.save()
    except sqlite3.Error, e:
        msg = 'A sqlite3 error was raised: "%s".' % e

        if 'disk' in str(e).lower():
            msg += ' Please check if your disk is full.'

        raise ScanMustStopException(msg)
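A quick standalone illustration of the substring test above: sqlite3 reports a full disk with messages such as "database or disk is full", which the lower-cased 'disk' check matches:

import sqlite3

# sqlite3.OperationalError is a subclass of sqlite3.Error, so the
# handler above would catch it and append the disk-space hint
e = sqlite3.OperationalError('database or disk is full')
assert 'disk' in str(e).lower()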
def get_tags_by_filter(self, http_response, tags, yield_text=False, cache=True):
    """
    Get specific tags from http_response using the cache if possible

    :param http_response: The http response instance
    :param tags: List of tags to get, or None if all tags should be
                 returned
    :param yield_text: Include the tag text (<a>text</a>)
    :param cache: True if the document parser should be saved to the cache
    :return: A list of Tag instances as defined in sgml.py
    """
    #
    # This is a performance hack that should reduce the time consumed by
    # this method without impacting its results. Note that in HTML this
    # is valid:
    #
    #   <script
    #
    # And this is invalid:
    #
    #   < script
    #
    # We use that in order to speed-up this function
    #
    if tags is not None:
        body_lower = http_response.get_body().lower()

        for tag in tags:
            lt_tag = '<%s' % tag
            if lt_tag in body_lower:
                break
        else:
            # No tag was found in the HTML
            return []

    #
    # Before doing anything too complex like caching, sending the HTTP
    # response to a different process for parsing, checking events, etc.
    # check if we can parse this HTTP response.
    #
    # This is a performance improvement that works *only if* the
    # DocumentParser.can_parse call is *fast*, which means that the
    # `can_parse` implementation of each parser needs to be fast.
    #
    # It doesn't matter if we say "yes" here and then parsing exceptions
    # appear later; that should happen in ~1 / 10000 calls and we would
    # still be gaining a lot of performance
    #
    if not self.can_parse(http_response):
        self._log_return_empty(http_response, 'No parser available')
        return []

    args = '%r%r' % (tags, yield_text)
    hash_string = get_body_unique_id(http_response, prepend=args)

    if hash_string in self._parser_blacklist:
        self._log_return_empty(http_response, 'HTTP response is blacklisted')
        return []

    #
    # We know that we can parse this document, let's work!
    #
    parser_finished = self._parser_finished_events.get(hash_string, None)
    if parser_finished is not None:
        # There is one subprocess already processing this http response
        # body, the best thing to do here is to make this thread wait
        # until that process has finished
        wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
        if not wait_result:
            # Act just like when there is no parser
            self._log_return_empty(http_response, 'Timeout waiting for response')
            return []

    # metric increase
    self.inc_query_count()

    parser = self._cache.get(hash_string, None)
    if parser is not None:
        self._handle_cache_hit(hash_string)
        return parser
    else:
        # Not in cache, have to work.
        self._handle_cache_miss(hash_string)

    # Create a new instance of DocumentParser, add it to the cache
    event = threading.Event()
    self._parser_finished_events[hash_string] = event

    try:
        tags = mp_doc_parser.get_tags_by_filter(http_response,
                                                tags,
                                                yield_text=yield_text)
    except TimeoutError:
        # We failed to get a parser for this HTTP response, we better
        # ban this HTTP response so we don't waste more CPU cycles
        # trying to parse it over and over.
        self.add_to_blacklist(hash_string)

        # Act just like when there is no parser
        self._log_return_empty(http_response,
                               'Timeout waiting for get_tags_by_filter()')
        return []
    except MemoryError:
        # We failed to get a parser for this HTTP response, we better
        # ban this HTTP response so we don't waste more CPU cycles or
        # memory trying to parse it over and over.
        self.add_to_blacklist(hash_string)

        # Act just like when there is no parser
        self._log_return_empty(http_response, 'Reached memory usage limit')
        return []
    except ScanMustStopException, e:
        msg = 'The document parser is in an invalid state! %s'
        raise ScanMustStopException(msg % e)
    except Exception, e:
        msg = 'Unhandled exception running get_tags_by_filter("%s"): %s'
        args = (http_response.get_url(), e)
        raise BaseFrameworkException(msg % args)
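The '<tag' performance hack at the top of get_tags_by_filter() can be shown in isolation: because HTML allows `<script` but not `< script`, a plain substring scan is a safe pre-filter that avoids invoking the parser at all when none of the requested tags can possibly be present:

body_lower = '<html><body><a href="/">link</a></body></html>'
tags = ['script', 'img']

# Neither '<script' nor '<img' appears, so the parser is never invoked
assert not any('<%s' % tag in body_lower for tag in tags)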
def get_document_parser_for(self, http_response, cache=True):
    """
    Get a document parser for http_response using the cache if possible

    :param http_response: The http response instance
    :param cache: True if the document parser should be saved to the cache
    :return: An instance of DocumentParser
    """
    #
    # Before doing anything too complex like caching, sending the HTTP
    # response to a different process for parsing, checking events, etc.
    # check if we can parse this HTTP response.
    #
    # This is a performance improvement that works *only if* the
    # DocumentParser.can_parse call is *fast*, which means that the
    # `can_parse` implementation of each parser needs to be fast.
    #
    # It doesn't matter if we say "yes" here and then parsing exceptions
    # appear later; that should happen in ~1 / 10000 calls and we would
    # still be gaining a lot of performance
    #
    if not self.can_parse(http_response):
        msg = 'There is no parser for "%s".'
        raise BaseFrameworkException(msg % http_response.get_url())

    hash_string = get_response_unique_id(http_response)

    if hash_string in self._parser_blacklist:
        msg = 'Exceeded timeout while parsing "%s" in the past. Not trying again.'
        raise BaseFrameworkException(msg % http_response.get_url())

    #
    # We know that we can parse this document, let's work!
    #
    parser_finished = self._parser_finished_events.get(hash_string, None)
    if parser_finished is not None:
        # There is one subprocess already processing this http response
        # body, the best thing to do here is to make this thread wait
        # until that process has finished
        wait_result = parser_finished.wait(timeout=mp_doc_parser.PARSER_TIMEOUT)
        if not wait_result:
            # Act just like when there is no parser
            msg = 'There is no parser for "%s". Waited more than %s sec.'
            args = (http_response.get_url(), mp_doc_parser.PARSER_TIMEOUT)
            raise BaseFrameworkException(msg % args)

    # metric increase
    self.inc_query_count()

    parser = self._cache.get(hash_string, None)
    if parser is not None:
        self._handle_cache_hit(hash_string)
        return parser
    else:
        # Not in cache, have to work.
        self._handle_cache_miss(hash_string)

    # Create a new instance of DocumentParser, add it to the cache
    event = threading.Event()
    self._parser_finished_events[hash_string] = event

    try:
        parser = mp_doc_parser.get_document_parser_for(http_response)
    except TimeoutError:
        # We failed to get a parser for this HTTP response, we better
        # ban this HTTP response so we don't waste more CPU cycles trying
        # to parse it over and over.
        self.add_to_blacklist(hash_string)

        # Act just like when there is no parser
        msg = 'Reached timeout parsing "%s".' % http_response.get_url()
        raise BaseFrameworkException(msg)
    except MemoryError:
        # We failed to get a parser for this HTTP response, we better
        # ban this HTTP response so we don't waste more CPU cycles or
        # memory trying to parse it over and over.
        self.add_to_blacklist(hash_string)

        # Act just like when there is no parser
        msg = 'Reached memory usage limit parsing "%s".' % http_response.get_url()
        raise BaseFrameworkException(msg)
    except ScanMustStopException, e:
        msg = 'The document parser is in an invalid state! %s'
        raise ScanMustStopException(msg % e)
    except:
        # Act just like when there is no parser
        msg = 'There is no parser for "%s".' % http_response.get_url()
        raise BaseFrameworkException(msg)
def alert_if_target_is_301_all(self):
    """
    Alert the user when the configured target is set to a site which
    will 301 redirect all requests to https://

    :see: https://github.com/andresriancho/w3af/issues/14976
    :return: True if the site returns 301 for all resources. Also an
             Info instance is saved to the KB in order to alert the user.
    """
    site_does_redirect = False
    msg = ('The configured target domain redirects all HTTP requests to a'
           ' different location. The most common scenarios are:\n\n'
           ''
           '    * HTTP redirect to HTTPS\n'
           '    * domain.com redirect to www.domain.com\n\n'
           ''
           'While the scan engine can identify URLs and vulnerabilities'
           ' using the current configuration it might be wise to start'
           ' a new scan setting the target URL to the redirect target.')

    targets = cf.cf.get('targets')

    for url in targets:
        # We test if the target URLs are redirecting to a different
        # protocol or domain.
        try:
            http_response = self._w3af_core.uri_opener.GET(url, cache=False)
        except ScanMustStopByUserRequest:
            # Not a real error, the user stopped the scan
            raise
        except Exception, e:
            emsg = 'Exception found during alert_if_target_is_301_all(): "%s"'
            emsg %= e

            om.out.debug(emsg)
            raise ScanMustStopException(emsg)
        else:
            if 300 <= http_response.get_code() <= 399:
                # Get the redirect target
                lower_headers = http_response.get_lower_case_headers()
                redirect_url = None

                for header_name in ('location', 'uri'):
                    if header_name in lower_headers:
                        header_value = lower_headers[header_name]
                        header_value = header_value.strip()
                        try:
                            redirect_url = URL(header_value)
                        except ValueError:
                            # No special invalid URL handling required
                            continue

                if not redirect_url:
                    continue

                # Check if the protocol was changed:
                target_proto = url.get_protocol()
                redirect_proto = redirect_url.get_protocol()

                if target_proto != redirect_proto:
                    site_does_redirect = True
                    break

                # Check if the domain was changed:
                target_domain = url.get_domain()
                redirect_domain = redirect_url.get_domain()

                if target_domain != redirect_domain:
                    site_does_redirect = True
                    break

    return site_does_redirect
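As a standalone illustration of the two redirect checks above, assuming w3af's URL class (the import path below is taken from recent w3af versions and may differ in older releases):

from w3af.core.data.parsers.doc.url import URL

target = URL('http://domain.com/')
redirect = URL('https://www.domain.com/')

# Either difference is enough to flag the target as a site that
# redirects everything to a new location
assert target.get_protocol() != redirect.get_protocol()
assert target.get_domain() != redirect.get_domain()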