def robots_precheck(self, iri: str) -> bool:
    """
    Decide whether a request for *iri* is permitted, using a
    robots.txt we already have on hand.

    This never fetches robots.txt over the network; if the IRI cannot
    be converted or checked, the request is allowed by default.
    """
    fetcher = RobotFetcher()
    try:
        target_uri = HttpRequest.iri_to_uri(iri)
        return fetcher.check_robots(target_uri, sync=True)
    except (UnicodeError, ValueError):
        # Malformed IRIs are treated as permissible rather than fatal.
        return True
def run_test(self) -> None:
    """
    Test a URI.

    Runs the request through a series of gates — referer filtering,
    robot/human HMAC check, per-client and per-origin rate limits, and
    a robots.txt check — before starting the actual test. Each gate
    that fails writes a complete error response and stops processing.
    """
    # try to initialise stored test results
    if self.config.get('save_dir', "") and os.path.exists(self.config['save_dir']):
        try:
            fd, self.save_path = tempfile.mkstemp(prefix='', dir=self.config['save_dir'])
            os.close(fd)  # only the path is used later; don't leak the descriptor
            self.test_id = os.path.split(self.save_path)[1]
        except (OSError, IOError):
            # Don't try to store it.
            self.test_id = None  # should already be None, but make sure

    top_resource = HttpResource(self.config, descend=self.descend)
    self.timeout = thor.schedule(int(self.config['max_runtime']),
                                 self.timeoutError, top_resource.show_task_map)
    top_resource.set_request(self.test_uri, req_hdrs=self.req_hdrs)
    formatter = find_formatter(self.format, 'html', self.descend)(
        self.config, self.output,
        allow_save=self.test_id, is_saved=False,
        test_id=self.test_id, descend=self.descend)

    # referer limiting
    referers = []
    for hdr, value in self.req_hdrs:
        if hdr.lower() == 'referer':
            referers.append(value)
    referer_error = None
    if len(referers) > 1:
        referer_error = "Multiple referers not allowed."
    if referers and urlsplit(referers[0]).hostname in self.referer_spam_domains:
        referer_error = "Referer not allowed."
    if referer_error:
        self.response_start(b"403", b"Forbidden", [
            (b"Content-Type", formatter.content_type()),
            (b"Cache-Control", b"max-age=360, must-revalidate")])
        formatter.start_output()
        formatter.error_output(referer_error)
        self.response_done([])
        return

    # robot human check
    if self.robot_time and self.robot_time.isdigit() and self.robot_hmac:
        valid_till = int(self.robot_time)
        # hmac.new() requires an explicit digestmod on Python 3.8+;
        # the generating side below uses the same algorithm.
        computed_hmac = hmac.new(self._robot_secret,
                                 bytes(self.robot_time, 'ascii'), "sha512")
        # compare_digest avoids leaking key material via timing.
        is_valid = hmac.compare_digest(self.robot_hmac, computed_hmac.hexdigest())
        if is_valid and valid_till >= thor.time():
            self.continue_test(top_resource, formatter)
            return
        else:
            self.response_start(b"403", b"Forbidden", [
                (b"Content-Type", formatter.content_type()),
                (b"Cache-Control", b"max-age=60, must-revalidate")])
            formatter.start_output()
            formatter.error_output("Naughty.")
            self.response_done([])
            self.error_log("Naughty robot key.")
            # Was missing: without this return we fell through and
            # started a second response after response_done().
            return

    # enforce client limits
    if self.config.getint('limit_client_tests', fallback=0):
        client_id = self.get_client_id()
        if client_id:
            if self._client_counts.get(client_id, 0) > \
              self.config.getint('limit_client_tests'):
                self.response_start(b"429", b"Too Many Requests", [
                    (b"Content-Type", formatter.content_type()),
                    (b"Cache-Control", b"max-age=60, must-revalidate")])
                formatter.start_output()
                formatter.error_output("Your client is over limit. Please try later.")
                self.response_done([])
                self.error_log("client over limit: %s" % client_id.decode('idna'))
                return
            self._client_counts[client_id] += 1

    # enforce origin limits
    if self.config.getint('limit_origin_tests', fallback=0):
        origin = url_to_origin(self.test_uri)
        if origin:
            if self._origin_counts.get(origin, 0) > \
              self.config.getint('limit_origin_tests'):
                self.response_start(b"429", b"Too Many Requests", [
                    (b"Content-Type", formatter.content_type()),
                    (b"Cache-Control", b"max-age=60, must-revalidate")])
                formatter.start_output()
                formatter.error_output("Origin is over limit. Please try later.")
                self.response_done([])
                self.error_log("origin over limit: %s" % origin)
                return
            self._origin_counts[origin] += 1

    # check robots.txt
    robot_fetcher = RobotFetcher(self.config)

    @thor.events.on(robot_fetcher)
    def robot(results: Tuple[str, bool]) -> None:
        url, robot_ok = results
        if robot_ok:
            self.continue_test(top_resource, formatter)
        else:
            valid_till = str(int(thor.time()) + 60)
            # Same digestmod as the validation above, so issued links verify.
            robot_hmac = hmac.new(self._robot_secret,
                                  bytes(valid_till, 'ascii'), "sha512")
            self.response_start(b"403", b"Forbidden", [
                (b"Content-Type", formatter.content_type()),
                (b"Cache-Control", b"no-cache")])
            formatter.start_output()
            # NOTE(review): test_uri is interpolated into HTML unescaped —
            # potential XSS if the formatter doesn't escape; verify upstream.
            formatter.error_output(
                "This site doesn't allow robots. If you are human, please "
                "<a href='?uri=%s&robot_time=%s&robot_hmac=%s'>click here</a>." % (
                    self.test_uri, valid_till, robot_hmac.hexdigest()))
            self.response_done([])

    robot_fetcher.check_robots(HttpRequest.iri_to_uri(self.test_uri))
def run_test(self) -> None:
    """
    Test a URI.

    Runs the request through a series of gates — referer filtering,
    robot/human HMAC check, per-client and per-origin rate limits, and
    a robots.txt check — before starting the actual test. Each gate
    that fails writes a complete error response and stops processing.
    """
    # try to initialise stored test results
    if self.config.get("save_dir", "") and os.path.exists(
            self.config["save_dir"]):
        try:
            fd, self.save_path = tempfile.mkstemp(
                prefix="", dir=self.config["save_dir"])
            os.close(fd)  # only the path is used later; don't leak the descriptor
            self.test_id = os.path.split(self.save_path)[1]
        except (OSError, IOError):
            # Don't try to store it.
            self.test_id = None  # should already be None, but make sure

    top_resource = HttpResource(self.config, descend=self.descend)
    self.timeout = thor.schedule(
        int(self.config["max_runtime"]),
        self.timeoutError,
        top_resource.show_task_map,
    )
    top_resource.set_request(self.test_uri, req_hdrs=self.req_hdrs)
    formatter = find_formatter(self.format, "html", self.descend)(
        self.config,
        self.output,
        allow_save=self.test_id,
        is_saved=False,
        test_id=self.test_id,
        descend=self.descend,
    )

    # referer limiting
    referers = []
    for hdr, value in self.req_hdrs:
        if hdr.lower() == "referer":
            referers.append(value)
    referer_error = None
    if len(referers) > 1:
        referer_error = "Multiple referers not allowed."
    if referers and urlsplit(
            referers[0]).hostname in self.referer_spam_domains:
        referer_error = "Referer not allowed."
    if referer_error:
        self.response_start(
            b"403",
            b"Forbidden",
            [
                (b"Content-Type", formatter.content_type()),
                (b"Cache-Control", b"max-age=360, must-revalidate"),
            ],
        )
        formatter.start_output()
        formatter.error_output(referer_error)
        self.response_done([])
        return

    # robot human check
    if self.robot_time and self.robot_time.isdigit() and self.robot_hmac:
        valid_till = int(self.robot_time)
        # hmac.new() requires an explicit digestmod on Python 3.8+;
        # the generating side below uses the same algorithm.
        computed_hmac = hmac.new(
            self._robot_secret, bytes(self.robot_time, "ascii"), "sha512")
        # compare_digest avoids leaking key material via timing.
        is_valid = hmac.compare_digest(
            self.robot_hmac, computed_hmac.hexdigest())
        if is_valid and valid_till >= thor.time():
            self.continue_test(top_resource, formatter)
            return
        else:
            self.response_start(
                b"403",
                b"Forbidden",
                [
                    (b"Content-Type", formatter.content_type()),
                    (b"Cache-Control", b"max-age=60, must-revalidate"),
                ],
            )
            formatter.start_output()
            formatter.error_output("Naughty.")
            self.response_done([])
            self.error_log("Naughty robot key.")
            # Was missing: without this return we fell through and
            # started a second response after response_done().
            return

    # enforce client limits
    if self.config.getint("limit_client_tests", fallback=0):
        client_id = self.get_client_id()
        if client_id:
            if self._client_counts.get(
                    client_id, 0) > self.config.getint("limit_client_tests"):
                self.response_start(
                    b"429",
                    b"Too Many Requests",
                    [
                        (b"Content-Type", formatter.content_type()),
                        (b"Cache-Control", b"max-age=60, must-revalidate"),
                    ],
                )
                formatter.start_output()
                formatter.error_output(
                    "Your client is over limit. Please try later.")
                self.response_done([])
                self.error_log("client over limit: %s" %
                               client_id.decode("idna"))
                return
            self._client_counts[client_id] += 1

    # enforce origin limits
    if self.config.getint("limit_origin_tests", fallback=0):
        origin = url_to_origin(self.test_uri)
        if origin:
            if self._origin_counts.get(
                    origin, 0) > self.config.getint("limit_origin_tests"):
                self.response_start(
                    b"429",
                    b"Too Many Requests",
                    [
                        (b"Content-Type", formatter.content_type()),
                        (b"Cache-Control", b"max-age=60, must-revalidate"),
                    ],
                )
                formatter.start_output()
                formatter.error_output(
                    "Origin is over limit. Please try later.")
                self.response_done([])
                self.error_log("origin over limit: %s" % origin)
                return
            self._origin_counts[origin] += 1

    # check robots.txt
    robot_fetcher = RobotFetcher(self.config)

    @thor.events.on(robot_fetcher)
    def robot(results: Tuple[str, bool]) -> None:
        url, robot_ok = results
        if robot_ok:
            self.continue_test(top_resource, formatter)
        else:
            valid_till = str(int(thor.time()) + 60)
            # Same digestmod as the validation above, so issued links verify.
            robot_hmac = hmac.new(
                self._robot_secret, bytes(valid_till, "ascii"), "sha512")
            self.response_start(
                b"403",
                b"Forbidden",
                [
                    (b"Content-Type", formatter.content_type()),
                    (b"Cache-Control", b"no-cache"),
                ],
            )
            formatter.start_output()
            # NOTE(review): test_uri is interpolated into HTML unescaped —
            # potential XSS if the formatter doesn't escape; verify upstream.
            formatter.error_output(
                "This site doesn't allow robots. If you are human, please "
                "<a href='?uri=%s&robot_time=%s&robot_hmac=%s'>click here</a>."
                % (self.test_uri, valid_till, robot_hmac.hexdigest()))
            self.response_done([])

    robot_fetcher.check_robots(HttpRequest.iri_to_uri(self.test_uri))