class RedFetcher(RedState):
    """
    Fetches the given URI (with the provided method, headers and body) and
    calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.
    If provided, name indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().
    """
    client = RedHttpClient()

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        """
        Set up the request and response state for one fetch.

        iri is passed to the request's set_iri(); req_hdrs is a list of
        (name, value) header tuples; req_body is the request payload, if
        any. status_cb receives progress strings; body_procs is a list of
        callables handed to the response via set_decoded_procs().
        """
        RedState.__init__(self, name)
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs or []
        self.request.payload = req_body
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = (method == "HEAD")
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs or [])
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None
        self.outstanding_tasks = 0
        self._st = []  # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        "Return the instance state without the exchange and callbacks."
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        """
        Remember that we've started a task.

        task is called with args plus done_cb=self.finish_task, so that it
        can report back when it has finished.
        """
        self.outstanding_tasks += 1
        self._st.append('add_task(%s)' % str(task))
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        "Note that we've finished a task, and see if we're done."
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks == 0:
            if self.done_cb:
                self.done_cb()
                self.done_cb = None
            # clean up potentially cyclic references
            self.status_cb = None

    def done(self):
        "Callback for when the response is complete and analysed."
        raise NotImplementedError

    def preflight(self):
        """
        Callback to check to see if we should bother running. Return
        True if so; False if not.
        """
        return True

    def run(self, done_cb=None):
        """
        Make an asynchronous HTTP request to uri, calling status_cb as it's
        updated and done_cb when it's done.

        The fetch itself counts as one outstanding task; it is finished
        straight away if preflight() declines or the request has no URI.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return
        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", u"RED/%s (http://redbot.org/)" % __version__))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        # Header names go out as ASCII and values as Latin-1; characters
        # that cannot be encoded are replaced rather than raising.
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Process the response start-line and headers."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        self.response.start_time = thor.time()
        self.response.version = self.exchange.res_version
        self.response.status_code = status.decode('iso-8859-1', 'replace')
        self.response.status_phrase = phrase.decode('iso-8859-1', 'replace')
        self.response.set_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk):
        "Process a chunk of the response body."
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Finish analysing the response, handling any parse errors."
        self._st.append('_response_done()')
        self.response.complete_time = thor.time()
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (
                self.request.uri, self.name
            ))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        "Handle an error encountered while fetching the response."
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
#        elif isinstance(error, httperr.ExtraDataErr):
#            res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            # Apply the fallback before slicing, so that an error without
            # any detail gives an empty sample instead of a TypeError.
            err_msg = (error.detail or "")[:20]
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                chunk_sample=err_msg.encode('string_escape')
            )
        self.done()
        self.finish_task()
class RedFetcher(RedState):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and
    calls:
      - status_cb as it progresses, and
      - every function in the body_procs list with each chunk of the body, and
      - done_cb when all tasks are done.
    If provided, name indicates the type of the request, and is used to
    help set notes and status_cb appropriately.

    The done() method is called when the response is done, NOT when all
    tasks are done. It can add tasks by calling add_task().
    """
    client = RedHttpClient()
    robot_files = {}  # cache of robots.txt
    robot_cache_dir = None
    robot_lookups = {}  # origin -> callbacks waiting on an in-flight fetch

    def __init__(self, iri, method="GET", req_hdrs=None, req_body=None,
                 status_cb=None, body_procs=None, name=None):
        """
        Set up the request and response state for one fetch.

        iri is passed to the request's set_iri(); req_hdrs is a list of
        (name, value) header tuples; req_body is the request payload, if
        any. status_cb receives progress strings; body_procs is a list of
        callables handed to the response via set_decoded_procs().
        """
        RedState.__init__(self, name)
        self.request = HttpRequest(self.notes, self.name)
        self.request.method = method
        self.request.set_iri(iri)
        self.request.headers = req_hdrs or []
        self.request.payload = req_body
        self.response = HttpResponse(self.notes, self.name)
        self.response.is_head_response = (method == "HEAD")
        self.response.base_uri = self.request.uri
        self.response.set_decoded_procs(body_procs or [])
        self.exchange = None
        self.status_cb = status_cb
        self.done_cb = None  # really should be "all tasks done"
        self.outstanding_tasks = 0
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self._st = []  # FIXME: this is temporary, for debugging thor

    def __getstate__(self):
        "Return the instance state without the exchange and callbacks."
        state = self.__dict__.copy()
        del state['exchange']
        del state['status_cb']
        del state['done_cb']
        return state

    def add_task(self, task, *args):
        """
        Remember that we've started a task.

        task is called with args plus done_cb=self.finish_task, so that it
        can report back when it has finished.
        """
        self.outstanding_tasks += 1
        self._st.append('add_task(%s)' % str(task))
        task(*args, done_cb=self.finish_task)

    def finish_task(self):
        "Note that we've finished a task, and see if we're done."
        self.outstanding_tasks -= 1
        self._st.append('finish_task()')
        assert self.outstanding_tasks >= 0, self._st
        if self.outstanding_tasks == 0:
            if self.done_cb:
                self.done_cb()
                self.done_cb = None
            # clean up potentially cyclic references
            self.status_cb = None

    def done(self):
        "Callback for when the response is complete and analysed."
        raise NotImplementedError

    def preflight(self):
        """
        Callback to check to see if we should bother running. Return
        True if so; False if not.
        """
        return True

    def fetch_robots_txt(self, url, cb, network=True):
        """
        Fetch the robots.txt for url's origin and then feed it to cb.

        If the response status is not 2xx, or the fetch fails, cb gets a
        blank document.

        If network is False, we won't use the network, will return the
        result immediately if cached, and will assume it's OK if we don't
        have a cached file.

        Returns the robots.txt text when it was available without using
        the network ("" when assumed OK), and None when the answer will
        only arrive through cb.
        """
        origin = url_to_origin(url)
        if origin is None:
            cb("")
            return ""
        origin_hash = hashlib.sha1(origin).hexdigest()

        if origin in self.robot_files:
            # FIXME: freshness lifetime
            cb(self.robot_files[origin])
            return self.robot_files[origin]

        if self.robot_cache_dir:
            robot_fd = CacheFile(path.join(self.robot_cache_dir, origin_hash))
            cached_robots_txt = robot_fd.read()
            if cached_robots_txt is not None:
                cb(cached_robots_txt)
                return cached_robots_txt

        if not network:
            cb("")
            return ""

        if origin in self.robot_lookups:
            # A fetch for this origin is already in flight; just wait for
            # it. Starting a second one would make its response_done look
            # up an entry that the first one has already removed.
            self.robot_lookups[origin].append(cb)
            return None
        self.robot_lookups[origin] = [cb]

        exchange = self.client.exchange()
        exchange.res_body = ""

        def deliver(robots_txt):
            "Hand the result to every waiting callback, at most once."
            for waiting_cb in self.robot_lookups.pop(origin, []):
                waiting_cb(robots_txt)

        @thor.on(exchange)
        def response_start(status, phrase, headers):
            exchange.status = status

        @thor.on(exchange)
        def response_body(chunk):
            exchange.res_body += chunk

        @thor.on(exchange)
        def response_done(trailers):
            if not exchange.status.startswith("2"):
                robots_txt = ""
            else:
                robots_txt = exchange.res_body
            self.robot_files[origin] = robots_txt
            if self.robot_cache_dir:
                robot_fd = CacheFile(
                    path.join(self.robot_cache_dir, origin_hash))
                robot_fd.write(robots_txt, 60*30)
            deliver(robots_txt)

        @thor.on(exchange)
        def error(err):
            # Don't leave the waiting callers hanging when the robots.txt
            # fetch itself fails; treat it like a missing file. The result
            # is deliberately not cached.
            deliver("")

        p_url = urlsplit(url)
        robots_url = "%s://%s/robots.txt" % (p_url.scheme, p_url.netloc)
        exchange.request_start("GET", robots_url, [('User-Agent', UA_STRING)])
        exchange.request_done([])
        return None

    def run(self, done_cb=None):
        """
        Make an asynchronous HTTP request to uri, calling status_cb as it's
        updated and done_cb when it's done.

        The fetch itself counts as one outstanding task; it is finished
        straight away if preflight() declines or the request has no URI.
        Otherwise the robots.txt check (if enabled) runs first and
        run_continue() carries on from there.
        """
        self.outstanding_tasks += 1
        self._st.append('run(%s)' % str(done_cb))
        self.done_cb = done_cb
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self.finish_task()
            return
        if self.follow_robots_txt:
            self.fetch_robots_txt(self.request.uri, self.run_continue)
        else:
            self.run_continue("")

    def run_continue(self, robots_txt):
        """
        Continue after getting the robots file. robots_txt is its content,
        or "" when there is nothing to check.

        TODO: refactor callback style into events.
        """
        if robots_txt == "":  # empty or non-2xx
            pass
        else:
            checker = RobotFileParser()
            checker.parse(
                robots_txt.decode('ascii', 'replace')
                .encode('ascii', 'replace').splitlines())
            if not checker.can_fetch(UA_STRING, self.request.uri):
                self.response.http_error = RobotsTxtError()
                self.finish_task()
                return  # TODO: show error?

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(
                (u"User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.on('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        if self.status_cb and self.name:
            self.status_cb("fetching %s (%s)" % (
                self.request.uri, self.name
            ))
        # Header names go out as ASCII and values as Latin-1; characters
        # that cannot be encoded are replaced rather than raising.
        req_hdrs = [
            (k.encode('ascii', 'replace'), v.encode('latin-1', 'replace'))
            for (k, v) in self.request.headers
        ]
        self.exchange.request_start(
            self.request.method, self.request.uri, req_hdrs
        )
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            # NOTE(review): transfer_out / transfer_in are not initialised
            # in this class; presumably RedState provides them - confirm.
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_start(self, status, phrase, res_headers):
        "Process the response start-line and headers."
        self._st.append('_response_start(%s, %s)' % (status, phrase))
        self.response.start_time = thor.time()
        self.response.version = self.exchange.res_version
        self.response.status_code = status.decode('iso-8859-1', 'replace')
        self.response.status_phrase = phrase.decode('iso-8859-1', 'replace')
        self.response.set_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk):
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers):
        "Finish analysing the response, handling any parse errors."
        self._st.append('_response_done()')
        self.response.complete_time = thor.time()
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        if self.status_cb and self.name:
            self.status_cb("fetched %s (%s)" % (
                self.request.uri, self.name
            ))
        self.done()
        self.finish_task()

    def _response_error(self, error):
        "Handle an error encountered while fetching the response."
        self._st.append('_response_error(%s)' % (str(error)))
        self.response.complete_time = thor.time()
        self.response.http_error = error
        if isinstance(error, httperr.BodyForbiddenError):
            self.add_note('header-none', rs.BODY_NOT_ALLOWED)
#        elif isinstance(error, httperr.ExtraDataErr):
#            res.payload_len += len(err.get('detail', ''))
        elif isinstance(error, httperr.ChunkError):
            # Apply the fallback before slicing, so that an error without
            # any detail gives an empty sample instead of a TypeError.
            err_msg = (error.detail or "")[:20]
            self.add_note('header-transfer-encoding', rs.BAD_CHUNK,
                chunk_sample=err_msg.encode('string_escape')
            )
        self.done()
        self.finish_task()
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' and 'debug' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    The class attribute 'check_name' indicates the type of the request, and
    is used in the status and debug messages; 'response_phrase' is attached
    to notes that don't name a response themselves.
    """

    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    client.idle_timeout = 5
    robot_emitter = thor.events.EventEmitter()

    def __init__(self, config: SectionProxy) -> None:
        "Create an idle fetcher; call set_request() and then check()."
        thor.events.EventEmitter.__init__(self)
        self.config = config
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        "Return the emitter's state without the exchange."
        state = thor.events.EventEmitter.__getstate__(self)
        del state["exchange"]
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if "response" not in kw:
            kw["response"] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(
        self,
        iri: str,
        method: str = "GET",
        req_hdrs: StrHeaderListType = None,
        req_body: bytes = None,
    ) -> None:
        """
        Set the resource's request.

        An IRI that raises UrlError is recorded as the response's
        http_error rather than propagated.
        """
        self.request.method = method
        self.response.is_head_response = method == "HEAD"  # type: ignore
        try:
            self.request.set_iri(iri)
        except httperr.UrlError as why:
            self.response.http_error = why
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done.

        Finishes straight away if preflight() declines or the request has
        no URI.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return

        self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file. If allowed is False, the
        fetch finishes with a RobotsTxtError instead of being made.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if "user-agent" not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on("response_nonfinal", self._response_nonfinal)
        self.exchange.once("response_start", self._response_start)
        self.exchange.on("response_body", self._response_body)
        self.exchange.once("response_done", self._response_done)
        self.exchange.on("error", self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        self.emit("debug", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [
            (k.encode("ascii", "replace"), v.encode("ascii", "replace"))
            for (k, v) in self.request.headers
        ]  # FIXME: should complain
        self.exchange.request_start(
            self.request.method.encode("ascii"),
            self.request.uri.encode("ascii"),
            req_hdrs,
        )
        self.request.start_time = thor.time()
        # _fetch_done() clears self.exchange, so re-check before each use.
        if not self.fetch_done:  # the request could have immediately failed.
            if self.request.payload is not None:
                self.exchange.request_body(self.request.payload)
                self.transfer_out += len(self.request.payload)
        if not self.fetch_done:  # the request could have immediately failed.
            self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes, res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes, res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("debug", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit(
            "debug",
            "fetch error %s (%s) - %s"
            % (self.request.uri, self.check_name, error.desc),
        )
        # Apply the fallback before slicing, so that an error without any
        # detail gives an empty sample instead of a TypeError.
        detail = error.detail or ""
        err_sample = detail[:40]
        if isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note("body", BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note("body", EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note("header-transfer-encoding", BAD_CHUNK, chunk_sample=err_sample)
        elif isinstance(error, httperr.HeaderSpaceError):
            subject = "header-%s" % (detail.lower().strip())
            self.add_note(subject, HEADER_NAME_SPACE, header_name=detail)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.exchange = None
            self.emit("fetch_done")
class RedFetcher(thor.events.EventEmitter):
    """
    Abstract class for a fetcher.

    Fetches the given URI (with the provided method, headers and body) and:
      - emits 'status' as it progresses
      - emits 'fetch_done' when the fetch is finished.

    The class attribute 'check_name' indicates the type of the request, and
    is used in the status messages; 'response_phrase' is attached to notes
    that don't name a response themselves.
    """
    check_name = "undefined"
    response_phrase = "undefined"
    client = RedHttpClient()
    robot_fetcher = RobotFetcher()

    def __init__(self) -> None:
        "Create an idle fetcher; call set_request() and then check()."
        thor.events.EventEmitter.__init__(self)
        self.notes = []  # type: List[Note]
        self.transfer_in = 0
        self.transfer_out = 0
        self.request = HttpRequest(self.ignore_note)  # type: HttpRequest
        self.nonfinal_responses = []  # type: List[HttpResponse]
        self.response = HttpResponse(self.add_note)  # type: HttpResponse
        self.exchange = None  # type: thor.http.ClientExchange
        self.follow_robots_txt = True  # Should we pay attention to robots file?
        self.fetch_started = False
        self.fetch_done = False

    def __getstate__(self) -> Dict[str, Any]:
        "Return the emitter's state without the exchange."
        state = thor.events.EventEmitter.__getstate__(self)
        del state['exchange']
        return state

    def __repr__(self) -> str:
        out = [self.__class__.__name__]
        if self.request.uri:
            out.append("%s" % self.request.uri)
        if self.fetch_started:
            out.append("fetch_started")
        if self.fetch_done:
            out.append("fetch_done")
        return "<%s at %#x>" % (", ".join(out), id(self))

    def add_note(self, subject: str, note: Type[Note], **kw: Union[str, int]) -> None:
        "Set a note."
        if 'response' not in kw:
            kw['response'] = self.response_phrase
        self.notes.append(note(subject, kw))

    def ignore_note(self, subject: str, note: Type[Note], **kw: str) -> None:
        "Ignore a note (for requests)."
        return

    def preflight(self) -> bool:
        """
        Check to see if we should bother running. Return True if so;
        False if not. Can be overridden.
        """
        return True

    def set_request(self, iri: str, method: str = "GET",
                    req_hdrs: StrHeaderListType = None, req_body: bytes = None) -> None:
        """
        Set the resource's request.
        """
        self.request.method = method
        self.response.is_head_response = (method == "HEAD")  # type: ignore
        self.request.set_iri(iri)
        self.response.base_uri = self.request.uri  # type: ignore
        if req_hdrs:
            self.request.set_headers(req_hdrs)
        self.request.payload = req_body  # type: ignore  # FIXME: encoding
        self.request.complete = True  # cheating a bit

    def check(self) -> None:
        """
        Make an asynchronous HTTP request to uri, emitting 'status' as it's
        updated and 'fetch_done' when it's done.

        Finishes straight away if preflight() declines or the request has
        no URI. Otherwise the robots.txt check (if enabled) runs first and
        run_continue() carries on from there.
        """
        if not self.preflight() or self.request.uri is None:
            # generally a good sign that we're not going much further.
            self._fetch_done()
            return

        if self.follow_robots_txt:
            # Listen before asking, so the answer is not missed.
            self.robot_fetcher.once("robot-%s" % self.request.uri, self.run_continue)
            self.robot_fetcher.check_robots(self.request.uri)
        else:
            self.run_continue(True)

    def run_continue(self, allowed: bool) -> None:
        """
        Continue after getting the robots file. If allowed is False, the
        fetch finishes with a RobotsTxtError instead of being made.
        """
        if not allowed:
            self.response.http_error = RobotsTxtError()
            self._fetch_done()
            return

        self.fetch_started = True

        if 'user-agent' not in [i[0].lower() for i in self.request.headers]:
            self.request.headers.append(("User-Agent", UA_STRING))
        self.exchange = self.client.exchange()
        self.exchange.on('response_nonfinal', self._response_nonfinal)
        self.exchange.once('response_start', self._response_start)
        self.exchange.on('response_body', self._response_body)
        self.exchange.once('response_done', self._response_done)
        self.exchange.on('error', self._response_error)
        self.emit("status", "fetching %s (%s)" % (self.request.uri, self.check_name))
        req_hdrs = [(k.encode('ascii'), v.encode('ascii'))
                    for (k, v) in self.request.headers]
        self.exchange.request_start(
            self.request.method.encode('ascii'),
            self.request.uri.encode('ascii'), req_hdrs)
        self.request.start_time = thor.time()
        if self.request.payload is not None:
            self.exchange.request_body(self.request.payload)
            self.transfer_out += len(self.request.payload)
        self.exchange.request_done([])

    def _response_nonfinal(self, status: bytes, phrase: bytes, res_headers: RawHeaderListType) -> None:
        "Got a non-final response."
        nfres = HttpResponse(self.add_note)
        nfres.process_top_line(self.exchange.res_version, status, phrase)
        nfres.process_raw_headers(res_headers)
        StatusChecker(nfres, self.request)
        self.nonfinal_responses.append(nfres)

    def _response_start(self, status: bytes, phrase: bytes, res_headers: RawHeaderListType) -> None:
        "Process the response start-line and headers."
        self.response.start_time = thor.time()
        self.response.process_top_line(self.exchange.res_version, status, phrase)
        self.response.process_raw_headers(res_headers)
        StatusChecker(self.response, self.request)
        checkCaching(self.response, self.request)

    def _response_body(self, chunk: bytes) -> None:
        "Process a chunk of the response body."
        self.transfer_in += len(chunk)
        self.response.feed_body(chunk)

    def _response_done(self, trailers: List[Tuple[bytes, bytes]]) -> None:
        "Finish analysing the response, handling any parse errors."
        self.emit("status", "fetched %s (%s)" % (self.request.uri, self.check_name))
        self.response.transfer_length = self.exchange.input_transfer_length
        self.response.header_length = self.exchange.input_header_length
        self.response.body_done(True, trailers)
        self._fetch_done()

    def _response_error(self, error: httperr.HttpError) -> None:
        "Handle an error encountered while fetching the response."
        self.emit("status", "fetch error %s (%s) - %s" % (
            self.request.uri, self.check_name, error.desc))
        # Apply the fallback before slicing, so that an error without any
        # detail gives an empty sample instead of a TypeError.
        err_sample = (error.detail or "")[:40]
        if error.client_recoverable:
            pass  # we'll get to this later.
        elif isinstance(error, httperr.ExtraDataError):
            if self.response.status_code == "304":
                self.add_note('body', BODY_NOT_ALLOWED, sample=err_sample)
            else:
                self.add_note('body', EXTRA_DATA, sample=err_sample)
        elif isinstance(error, httperr.ChunkError):
            self.add_note('header-transfer-encoding', BAD_CHUNK, chunk_sample=err_sample)
        else:
            self.response.http_error = error
        self._fetch_done()

    def _fetch_done(self) -> None:
        "Mark the fetch finished and emit 'fetch_done', at most once."
        if not self.fetch_done:
            self.fetch_done = True
            self.emit("fetch_done")