def robots_precheck(self, iri: str) -> bool:
    """
    Consult an already-available robots.txt to see whether requesting
    `iri` is permissible. This does not fetch robots.txt.

    Returns the result of the synchronous robots check, or True when the
    IRI cannot be converted to a URI (UnicodeError / ValueError).
    """
    fetcher = RobotFetcher()
    try:
        permitted = fetcher.check_robots(HttpRequest.iri_to_uri(iri), sync=True)
    except (UnicodeError, ValueError):
        # The IRI could not be turned into a URI to check; do not block it.
        permitted = True
    return permitted
def __init__(self, config: SectionProxy) -> None:
    """
    Prepare an idle fetcher for the given configuration section.

    Nothing is requested here; this only creates the request/response
    holders, the robots.txt fetcher and the bookkeeping attributes.
    """
    thor.events.EventEmitter.__init__(self)
    self.config = config
    self.robot_fetcher = RobotFetcher(self.config)

    # The request is given ignore_note; the response is given add_note.
    self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
    self.response = HttpResponse(self.add_note)  # type: HttpResponse
    self.nonfinal_responses = []  # type: List[HttpResponse]
    self.notes = []  # type: List[Note]
    self.exchange = None  # type: thor.http.ClientExchange

    # Transfer counters, both starting at zero.
    self.transfer_in = self.transfer_out = 0

    # Progress flags and robots.txt policy.
    self.fetch_started = self.fetch_done = False
    self.follow_robots_txt = True  # Should we pay attention to robots file?
def __init__(self, config: SectionProxy) -> None:
    """
    Prepare an idle fetcher for the given configuration section.

    Nothing is requested here; this only creates the request/response
    holders, the robots.txt fetcher and the bookkeeping attributes.
    """
    thor.events.EventEmitter.__init__(self)
    self.config = config
    self.robot_fetcher = RobotFetcher(self.config)

    # The request is given ignore_note; the response is given add_note.
    self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
    self.response = HttpResponse(self.add_note)  # type: HttpResponse
    self.nonfinal_responses = []  # type: List[HttpResponse]
    self.notes = []  # type: List[Note]
    self.exchange = None  # type: thor.http.ClientExchange

    # Transfer counters, both starting at zero.
    self.transfer_in = self.transfer_out = 0

    # Progress flags and robots.txt policy.
    self.fetch_started = self.fetch_done = False
    self.follow_robots_txt = False  # Should we pay attention to robots file?
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    'check_name' labels the request in the 'status' and 'debug' messages;
    'response_phrase' is the default 'response' value attached to notes.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        """Create an idle fetcher; call set_request() and then check() to use it."""
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.robot_fetcher = RobotFetcher(self.config)
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        """Return the emitter's state with the 'exchange' entry removed."""
        state = thor.events.EventEmitter.__getstate__(self)
        del state['exchange']
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str = "GET",
                    req_hdrs: StrHeaderListType = None, req_body: bytes = None) -> None:
        """
        Set the resource's request. All values are strings.

        A URL error raised while setting the IRI is not propagated; it is
        recorded on the response as its http_error.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done.

        If preflight() fails or no request URI is set, finish immediately.
        When follow_robots_txt is set, the robots check runs first and
        run_continue() receives its verdict.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        if self.follow_robots_txt:
            self.robot_fetcher.emitter.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.

        If not allowed, record a RobotsTxtError on the response and finish;
        otherwise start the HTTP exchange.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('ascii', 'replace'))
            for (k, v) in self.request.headers
        ]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode('ascii'),
            self.request.uri.encode('ascii'),
            req_hdrs)
        self.request.start_time = thor.time()
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        """
        Handle an error encountered while fetching the response.

        Extra-data, chunking and header-space errors become notes; any
        other error is stored as the response's http_error. The fetch is
        finished in every case.
        """
        self.emit(
            "debug",
            "fetch error %s (%s) - %s" % (self.request.uri, self.check_name, error.desc))
        # Apply the empty fallback *before* slicing, so an error whose
        # detail is unset (None) doesn't raise TypeError here.
        detail = error.detail or ""
        err_sample = detail[:40]
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = 'header-%s' % (detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")
def run_test(self) -> None:
    """
    Test a URI.

    Sets up result storage, the top-level resource and the output
    formatter, then applies the gatekeeping checks in order: referer
    limits, the "I am human" robot token, per-client limits, per-origin
    limits, and finally robots.txt. Each refusal writes a complete error
    response and stops; otherwise the test continues via continue_test().
    """
    # Local import: only needed to build the "click here" link below.
    from urllib.parse import quote

    # try to initialise stored test results
    if self.config.get('save_dir', "") and os.path.exists(self.config['save_dir']):
        try:
            fd, self.save_path = tempfile.mkstemp(prefix='', dir=self.config['save_dir'])
            # mkstemp hands back an open descriptor; only the path is kept,
            # so close it here rather than leak one per test.
            os.close(fd)
            self.test_id = os.path.split(self.save_path)[1]
        except (OSError, IOError):
            # Don't try to store it.
            self.test_id = None  # should already be None, but make sure

    top_resource = HttpResource(self.config, descend=self.descend)
    self.timeout = thor.schedule(
        int(self.config['max_runtime']), self.timeoutError, top_resource.show_task_map)
    top_resource.set_request(self.test_uri, req_hdrs=self.req_hdrs)
    formatter = find_formatter(self.format, 'html', self.descend)(
        self.config, self.output, allow_save=self.test_id, is_saved=False,
        test_id=self.test_id, descend=self.descend)

    def refuse(status: bytes, phrase: bytes, cache_control: bytes, message: str) -> None:
        "Write a complete error response through the formatter."
        self.response_start(status, phrase, [
            (b"Content-Type", formatter.content_type()),
            (b"Cache-Control", cache_control)])
        formatter.start_output()
        formatter.error_output(message)
        self.response_done([])

    # referer limiting
    referers = [value for hdr, value in self.req_hdrs if hdr.lower() == 'referer']
    referer_error = None
    if len(referers) > 1:
        referer_error = "Multiple referers not allowed."
    if referers and urlsplit(referers[0]).hostname in self.referer_spam_domains:
        referer_error = "Referer not allowed."
    if referer_error:
        refuse(b"403", b"Forbidden", b"max-age=360, must-revalidate", referer_error)
        return

    # robot human check
    if self.robot_time and self.robot_time.isdigit() and self.robot_hmac:
        is_valid = False
        valid_till = 0
        try:
            # str.isdigit() also accepts non-ASCII digits; such input can
            # never carry a valid token, so treat it as invalid, not a crash.
            time_bytes = self.robot_time.encode('ascii')
            supplied_hmac = self.robot_hmac.encode('ascii')
            valid_till = int(self.robot_time)
        except (UnicodeError, ValueError):
            pass
        else:
            # digestmod is mandatory on Python 3.8+; it must match the one
            # used when the token is issued further down.
            expected_hmac = hmac.new(self._robot_secret, time_bytes, 'sha256').hexdigest()
            # Constant-time comparison of the client-supplied token.
            is_valid = hmac.compare_digest(supplied_hmac, expected_hmac.encode('ascii'))
        if is_valid and valid_till >= thor.time():
            self.continue_test(top_resource, formatter)
            return
        refuse(b"403", b"Forbidden", b"max-age=60, must-revalidate", "Naughty.")
        self.error_log("Naughty robot key.")
        # The response is finished; don't fall through to the checks below.
        return

    # enforce client limits
    if self.config.getint('limit_client_tests', fallback=0):
        client_id = self.get_client_id()
        if client_id:
            if self._client_counts.get(client_id, 0) > \
              self.config.getint('limit_client_tests'):
                refuse(b"429", b"Too Many Requests", b"max-age=60, must-revalidate",
                       "Your client is over limit. Please try later.")
                self.error_log("client over limit: %s" % client_id.decode('idna'))
                return
            self._client_counts[client_id] += 1

    # enforce origin limits
    if self.config.getint('limit_origin_tests', fallback=0):
        origin = url_to_origin(self.test_uri)
        if origin:
            if self._origin_counts.get(origin, 0) > \
              self.config.getint('limit_origin_tests'):
                refuse(b"429", b"Too Many Requests", b"max-age=60, must-revalidate",
                       "Origin is over limit. Please try later.")
                self.error_log("origin over limit: %s" % origin)
                return
            self._origin_counts[origin] += 1

    # check robots.txt
    robot_fetcher = RobotFetcher(self.config)

    # NOTE: the handler is registered via the decorator without an explicit
    # event name, so the function name `robot` must stay as it is.
    @thor.events.on(robot_fetcher)
    def robot(results: Tuple[str, bool]) -> None:
        url, robot_ok = results
        if robot_ok:
            self.continue_test(top_resource, formatter)
        else:
            valid_till = str(int(thor.time()) + 60)
            robot_hmac = hmac.new(self._robot_secret, bytes(valid_till, 'ascii'), 'sha256')
            # The URI is user-supplied and lands inside an HTML attribute:
            # percent-encode everything so it cannot break out of the href.
            link_uri = quote(self.test_uri, safe='')
            refuse(b"403", b"Forbidden", b"no-cache",
                   "This site doesn't allow robots. If you are human, please <a href='?uri=%s&robot_time=%s&robot_hmac=%s'>click here</a>." %
                   (link_uri, valid_till, robot_hmac.hexdigest()))

    robot_fetcher.check_robots(HttpRequest.iri_to_uri(self.test_uri))
def run_test(self) -> None:
    """
    Test a URI.

    Sets up result storage, the top-level resource and the output
    formatter, then applies the gatekeeping checks in order: referer
    limits, the "I am human" robot token, per-client limits, per-origin
    limits, and finally robots.txt. Each refusal writes a complete error
    response and stops; otherwise the test continues via continue_test().
    """
    # Local import: only needed to build the "click here" link below.
    from urllib.parse import quote

    # try to initialise stored test results
    if self.config.get("save_dir", "") and os.path.exists(
            self.config["save_dir"]):
        try:
            fd, self.save_path = tempfile.mkstemp(
                prefix="", dir=self.config["save_dir"])
            # mkstemp hands back an open descriptor; only the path is kept,
            # so close it here rather than leak one per test.
            os.close(fd)
            self.test_id = os.path.split(self.save_path)[1]
        except (OSError, IOError):
            # Don't try to store it.
            self.test_id = None  # should already be None, but make sure

    top_resource = HttpResource(self.config, descend=self.descend)
    self.timeout = thor.schedule(
        int(self.config["max_runtime"]),
        self.timeoutError,
        top_resource.show_task_map,
    )
    top_resource.set_request(self.test_uri, req_hdrs=self.req_hdrs)
    formatter = find_formatter(self.format, "html", self.descend)(
        self.config,
        self.output,
        allow_save=self.test_id,
        is_saved=False,
        test_id=self.test_id,
        descend=self.descend,
    )

    def refuse(status: bytes, phrase: bytes, cache_control: bytes,
               message: str) -> None:
        "Write a complete error response through the formatter."
        self.response_start(
            status,
            phrase,
            [
                (b"Content-Type", formatter.content_type()),
                (b"Cache-Control", cache_control),
            ],
        )
        formatter.start_output()
        formatter.error_output(message)
        self.response_done([])

    # referer limiting
    referers = [
        value for hdr, value in self.req_hdrs if hdr.lower() == "referer"
    ]
    referer_error = None
    if len(referers) > 1:
        referer_error = "Multiple referers not allowed."
    if referers and urlsplit(
            referers[0]).hostname in self.referer_spam_domains:
        referer_error = "Referer not allowed."
    if referer_error:
        refuse(b"403", b"Forbidden", b"max-age=360, must-revalidate",
               referer_error)
        return

    # robot human check
    if self.robot_time and self.robot_time.isdigit() and self.robot_hmac:
        is_valid = False
        valid_till = 0
        try:
            # str.isdigit() also accepts non-ASCII digits; such input can
            # never carry a valid token, so treat it as invalid, not a crash.
            time_bytes = self.robot_time.encode("ascii")
            supplied_hmac = self.robot_hmac.encode("ascii")
            valid_till = int(self.robot_time)
        except (UnicodeError, ValueError):
            pass
        else:
            # digestmod is mandatory on Python 3.8+; it must match the one
            # used when the token is issued further down.
            expected_hmac = hmac.new(self._robot_secret, time_bytes,
                                     "sha256").hexdigest()
            # Constant-time comparison of the client-supplied token.
            is_valid = hmac.compare_digest(supplied_hmac,
                                           expected_hmac.encode("ascii"))
        if is_valid and valid_till >= thor.time():
            self.continue_test(top_resource, formatter)
            return
        refuse(b"403", b"Forbidden", b"max-age=60, must-revalidate",
               "Naughty.")
        self.error_log("Naughty robot key.")
        # The response is finished; don't fall through to the checks below.
        return

    # enforce client limits
    if self.config.getint("limit_client_tests", fallback=0):
        client_id = self.get_client_id()
        if client_id:
            if self._client_counts.get(
                    client_id, 0) > self.config.getint("limit_client_tests"):
                refuse(b"429", b"Too Many Requests",
                       b"max-age=60, must-revalidate",
                       "Your client is over limit. Please try later.")
                self.error_log("client over limit: %s" %
                               client_id.decode("idna"))
                return
            self._client_counts[client_id] += 1

    # enforce origin limits
    if self.config.getint("limit_origin_tests", fallback=0):
        origin = url_to_origin(self.test_uri)
        if origin:
            if self._origin_counts.get(
                    origin, 0) > self.config.getint("limit_origin_tests"):
                refuse(b"429", b"Too Many Requests",
                       b"max-age=60, must-revalidate",
                       "Origin is over limit. Please try later.")
                self.error_log("origin over limit: %s" % origin)
                return
            self._origin_counts[origin] += 1

    # check robots.txt
    robot_fetcher = RobotFetcher(self.config)

    # NOTE: the handler is registered via the decorator without an explicit
    # event name, so the function name `robot` must stay as it is.
    @thor.events.on(robot_fetcher)
    def robot(results: Tuple[str, bool]) -> None:
        url, robot_ok = results
        if robot_ok:
            self.continue_test(top_resource, formatter)
        else:
            valid_till = str(int(thor.time()) + 60)
            robot_hmac = hmac.new(self._robot_secret,
                                  bytes(valid_till, "ascii"), "sha256")
            # The URI is user-supplied and lands inside an HTML attribute:
            # percent-encode everything so it cannot break out of the href.
            link_uri = quote(self.test_uri, safe="")
            refuse(
                b"403", b"Forbidden", b"no-cache",
                "This site doesn't allow robots. If you are human, please <a href='?uri=%s&robot_time=%s&robot_hmac=%s'>click here</a>."
                % (link_uri, valid_till, robot_hmac.hexdigest()))

    robot_fetcher.check_robots(HttpRequest.iri_to_uri(self.test_uri))
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    'check_name' labels the request in the 'status' and 'debug' messages;
    'response_phrase' is the default 'response' value attached to notes.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        """Create an idle fetcher; call set_request() and then check() to use it."""
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.robot_fetcher = RobotFetcher(self.config)
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.follow_robots_txt = False  # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        """Return the emitter's state with the 'exchange' entry removed."""
        state = thor.events.EventEmitter.__getstate__(self)
        del state['exchange']
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str = "GET",
                    req_hdrs: StrHeaderListType = None, req_body: bytes = None) -> None:
        """
        Set the resource's request. All values are strings.

        A URL error raised while setting the IRI is not propagated; it is
        recorded on the response as its http_error.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done.

        If preflight() fails or no request URI is set, finish immediately.
        When follow_robots_txt is set, the robots check runs first and
        run_continue() receives its verdict.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return
        if self.follow_robots_txt:
            self.robot_fetcher.emitter.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file.

        If not allowed, record a RobotsTxtError on the response and finish;
        otherwise start the HTTP exchange.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [(k.encode('ascii', 'replace'), v.encode('ascii', 'replace'))
                    for (k, v) in self.request.headers]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode('ascii'),
            self.request.uri.encode('ascii'), req_hdrs)
        self.request.start_time = thor.time()
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes,
                           res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes,
                        res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        """
        Handle an error encountered while fetching the response.

        Extra-data, chunking and header-space errors become notes; any
        other error is stored as the response's http_error. The fetch is
        finished in every case.
        """
        self.emit("debug", "fetch error %s (%s) - %s" % (
            self.request.uri, self.check_name, error.desc))
        # Apply the empty fallback *before* slicing, so an error whose
        # detail is unset (None) doesn't raise TypeError here.
        detail = error.detail or ""
        err_sample = detail[:40]
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = 'header-%s' % (detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")