def test_config_defaults(self):
    """With a zero factor, http_retry_delay() returns exactly the
    configured offset."""
    cases = [
        # (factor, offset, expected delay)
        (0, 0, 0),
        (0, 1, 1),
    ]
    for factor, offset, expected in cases:
        config["agent_http_retry_delay_factor"] = factor
        config["agent_http_retry_delay_offset"] = offset
        self.assertEqual(http_retry_delay(), expected)
def mark_software_available(software, version):
    """
    POST the availability of *software* at *version* to the master,
    retrying until the master either accepts the data or answers with a
    status that is not worth retrying.

    NOTE(review): this generator reads ``self.testing`` near the end, so
    it is presumably a closure defined inside a method -- confirm against
    the surrounding file.
    """
    url = "{master_api}/agents/{agent}/software/".format(
        master_api=config.get("master_api"),
        agent=config.get("agent_id"))
    while True:
        try:
            # `yield` implies an inlineCallbacks-style coroutine; the
            # response is delivered asynchronously.
            response = yield post_direct(url, data={
                "software": software,
                "version": version})
        except Exception as error:
            # Transport-level failure: wait for the retry delay, then
            # loop and resend the whole request.
            delay = http_retry_delay()
            logger.error(
                "Failed to post availability of software %s, "
                "version %s to master: %r. Will retry in %s "
                "seconds.", software, version, error, delay)
            deferred = Deferred()
            reactor.callLater(delay, deferred.callback, None)
            yield deferred
        else:
            data = yield treq.content(response)
            if response.code == OK:
                logger.info("Posted availability of software %s, "
                            "version %s to master.", software, version)
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                # Server-side problem which may clear up; retry after
                # a delay.
                delay = http_retry_delay()
                logger.warning(
                    "Could not post availability of software %s, "
                    "version %s. The master responded with "
                    "INTERNAL_SERVER_ERROR. Retrying in %s "
                    "seconds.", software, version, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Any other status is treated as non-retryable.
                logger.error(
                    "Failed to post availability of software %s, "
                    "version %s: "
                    "Unexpected status from server %s. Data: %s",
                    software, version, response.code, data)
                break
    if self.testing:
        # Test hook: signal completion to a waiting test.
        self.operation_deferred.callback(None)
def mark_software_not_available(software, version):
    """
    DELETE the given *software*/*version* entry for this agent on the
    master, retrying until the master confirms removal or answers with a
    status that is not worth retrying.

    NOTE(review): this generator reads ``self.testing`` near the end, so
    it is presumably a closure defined inside a method -- confirm against
    the surrounding file.
    """
    url = ("{master_api}/agents/{agent}/software/{software}/"
           "versions/{version}").format(
        master_api=config.get("master_api"),
        agent=config.get("agent_id"),
        software=software,
        version=version)
    while True:
        try:
            # inlineCallbacks-style asynchronous DELETE request.
            response = yield delete_direct(url)
        except Exception as error:
            # Transport-level failure: wait and retry the request.
            delay = http_retry_delay()
            logger.error(
                "Failed to remove software %s, version %s from this "
                "agent on master: %r. Will retry in %s seconds.",
                software, version, error, delay)
            deferred = Deferred()
            reactor.callLater(delay, deferred.callback, None)
            yield deferred
        else:
            data = yield treq.content(response)
            # Several success codes are acceptable for a DELETE.
            if response.code in [OK, ACCEPTED, NO_CONTENT]:
                logger.info("Removed software %s, version %s from this "
                            "agent on master.", software, version)
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                # Server-side problem which may clear up; retry after
                # a delay.
                delay = http_retry_delay()
                logger.warning(
                    "Could not remove software %s, version %s from "
                    "this agent. The master responded with "
                    "INTERNAL_SERVER_ERROR. Retrying in %s "
                    "seconds.", software, version, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Any other status is treated as non-retryable.
                logger.error(
                    "Failed to remove software %s, version %s from "
                    "this agent: "
                    "Unexpected status from server %s. Data: %s",
                    software, version, response.code, data)
                break
    if self.testing:
        # Test hook: signal completion to a waiting test.
        self.operation_deferred.callback(None)
def update_failed(arg):
    """Log a failed update download and schedule another attempt.

    NOTE(review): ``data`` and ``download_update`` come from the
    enclosing scope -- this looks like a nested errback; confirm against
    the surrounding file.
    """
    logger.error("Downloading update failed: %r", arg)
    got_failure = isinstance(arg, twisted.python.failure.Failure)
    if got_failure:
        logger.error("Traceback: %s", arg.getTraceback())
    retry_in = http_retry_delay()
    reactor.callLater(retry_in, download_update, data["version"])
def result_callback(url, log_identifier, response):
    """
    Handle the HTTP response from a log file upload.

    Retries on 5xx server errors, gives up on other unexpected status
    codes, and fires ``upload_deferred`` with the final outcome.

    Fixes two log-message typos from the original: "Retrying. in" and
    "Uploaded logfile %s for to master".

    NOTE(review): ``upload`` and ``upload_deferred`` come from the
    enclosing scope -- this looks like a nested callback.
    """
    if 500 <= response.code < 600:
        # Server-side problem: worth retrying after a delay.
        delay = http_retry_delay()
        logger.error(
            "Server side error while uploading log file %s, "
            "status code: %s. Retrying in %s seconds",
            log_identifier,
            response.code,
            delay,
        )
        upload(url, log_identifier, delay=delay)
    elif response.code not in [OK, CREATED, ACCEPTED]:
        # Nothing else we could do about that, this is
        # a problem on our end.
        logger.error(
            "Could not upload logfile %s status code: %s. "
            "This is a client side error, giving up.",
            log_identifier,
            response.code,
        )
        try:
            upload_deferred.errback(
                ValueError("Bad return code on uploading logfile: %s"
                           % response.code))
        except Exception as e:
            # The deferred may already have fired; log rather than crash.
            logger.error(
                "Caught exception calling upload_deferred.errback: %s", e)
    else:
        logger.info("Uploaded logfile %s to master", log_identifier)
        try:
            upload_deferred.callback(None)
        except Exception as e:
            # The deferred may already have fired; log rather than crash.
            logger.error(
                "Caught exception calling upload_deferred.callback: %s", e)
def error_callback(task, log_path, deferred, num_retry_errors, failure_reason):
    """Decide how to retry a failed logfile registration.

    Gives up once the configured retry budget is exhausted, retries
    immediately for transient transport failures, and retries after a
    delay for everything else.
    """
    retries_exhausted = (
        num_retry_errors > config["broken_connection_max_retry"])
    if retries_exhausted:
        logger.error(
            "Error while registering logfile %s for task %s on master. "
            "Maximum number of retries reached. Not retrying the "
            "request.",
            log_path,
            task["id"],
        )
        deferred.errback(None)
        return

    transient_types = (ResponseNeverReceived, RequestTransmissionFailed)
    if failure_reason.type in transient_types:
        # Transport hiccup -- retry right away without a delay.
        logger.debug(
            "Error while registering logfile %s for task %s on "
            "master: %s, retrying immediately",
            log_path,
            task["id"],
            failure_reason.type.__name__,
        )
        post_logfile(task, log_path, post_deferred=deferred)
    else:
        delay = http_retry_delay()
        logger.error(
            "Error while registering logfile %s for task %s on "
            "master: %r, retrying in %s seconds.",
            log_path,
            task["id"],
            failure_reason,
            delay,
        )
        post_logfile(task, log_path, post_deferred=deferred, delay=delay)
def result_callback(cburl, cbdata, task, response):
    """Inspect the master's response after posting a task-failure update.

    Retries on 5xx responses, logs and gives up on any other non-OK
    status, and logs success otherwise.
    """
    code = response.code
    if 500 <= code < 600:
        logger.error(
            "Error while marking task %s as failed on master, "
            "retrying", task["id"])
        post_update(cburl, cbdata, task, delay=http_retry_delay())
        return
    if code != OK:
        logger.error(
            "Could not mark task %s as failed, server response "
            "code was %s", task["id"], code)
        return
    logger.info(
        "Marked task %s as failed on master", task["id"])
def _download_jobtype(cls, name, version):
    """
    Downloads the job type specified in ``assignment``.  This method
    will pass the response it receives to :meth:`_cache_jobtype` however
    failures will be retried.
    """
    logger.debug("Downloading job type %r version %s", name, version)
    url = str(cls.JOBTYPE_VERSION_URL % {
        "master_api": config["master_api"],
        "name": name,
        "version": version})
    result = Deferred()

    # Named function instead of a lambda so the retry wiring is easier
    # to read; on any error we simply schedule another attempt.
    def download(*_):
        get(url,
            callback=result.callback,
            errback=lambda *_: reactor.callLater(
                http_retry_delay(), download))

    download()
    return result
def error_callback(url, log_identifier, failure_reason):
    """Retry a failed log upload.

    Transient transport errors are retried immediately; anything else
    is retried after the standard HTTP retry delay.
    """
    transient_types = (ResponseNeverReceived, RequestTransmissionFailed)
    if failure_reason.type in transient_types:
        logger.debug(
            "Error while uploading logfile %s to master: "
            "%s, retrying immediately",
            log_identifier,
            failure_reason.type.__name__,
        )
        upload(url, log_identifier)
        return

    delay = http_retry_delay()
    logger.error(
        "Error while uploading logfile %s to master: "
        "%r, retrying in %s seconds.",
        log_identifier,
        failure_reason,
        delay,
    )
    upload(url, log_identifier, delay=delay)
def _jobtype_download_complete(cls, response, cache_key):
    """Process a finished jobtype download: retry on server errors,
    then either load the jobtype directly or cache it first."""
    # Server is offline or experiencing issues right
    # now so we should retry the request.
    if response.code >= INTERNAL_SERVER_ERROR:
        logger.debug(
            "Could not download jobtype because of internal server error.")
        return reactor.callLater(http_retry_delay(), response.request.retry)

    payload = response.json()
    if config["jobtype_enable_cache"]:
        # When the download is complete, cache the results
        logger.debug("Caching the jobtype locally.")
        caching = cls._cache_jobtype(cache_key, payload)
        caching.addCallback(lambda result: cls._load_jobtype(*result))
        return caching

    logger.debug("Jobtype cache is disabled, loading the jobtype.")
    return cls._load_jobtype(payload, None)
def result_callback(task, log_path, deferred, response):
    """Handle the master's response to a logfile registration.

    Retries on 5xx responses, treats CONFLICT as success (the logfile is
    already registered), and gives up on other client-side errors.

    NOTE(review): reads ``self.assignment`` from the enclosing scope --
    this looks like a nested callback.
    """
    code = response.code
    job = self.assignment["job"]
    if 500 <= code < 600:
        delay = http_retry_delay()
        logger.error(
            "Server side error while registering log file %s for "
            "task %s (frame %s) in job %s (id %s), status code: %s. "
            "Retrying in %s seconds",
            log_path,
            task["id"],
            task["frame"],
            job["title"],
            job["id"],
            code,
            delay,
        )
        post_logfile(task, log_path, post_deferred=deferred, delay=delay)
    # The server will return CONFLICT if we try to register a logfile
    # twice.
    elif code not in [OK, CONFLICT, CREATED]:
        # Nothing else we could do about that, this is
        # a problem on our end.
        logger.error(
            "Could not register logfile %s for task %s (frame %s) in "
            "job %s (id %s), status code: %s. This is a client side "
            "error, giving up.",
            log_path,
            task["id"],
            task["frame"],
            job["title"],
            job["id"],
            code,
        )
        deferred.errback(None)
    else:
        logger.info("Registered logfile %s for task %s on master",
                    log_path, task["id"])
        deferred.callback(None)
def post_agent_to_master(self):
    """
    Runs the POST request to contact the master.  Running this
    method multiple times should be considered safe but is generally
    something that should be avoided.
    """
    url = self.agents_endpoint()
    data = self.system_data()

    try:
        response = yield post_direct(url, data=data)
    except Exception as failure:
        # Request never made it out; decide whether to retry based on
        # whether we are shutting down.
        delay = http_retry_delay()
        if isinstance(failure, ConnectionRefusedError):
            svclog.error(
                "Failed to POST agent to master, the connection was "
                "refused. Retrying in %s seconds", delay)
        else:  # pragma: no cover
            svclog.error(
                "Unhandled error when trying to POST the agent to the "
                "master. The error was %s.", failure)

        if not self.shutting_down:
            svclog.info(
                "Retrying failed POST to master in %s seconds.", delay)
            yield deferLater(reactor, delay, self.post_agent_to_master)
        else:
            svclog.warning("Not retrying POST to master, shutting down.")
    else:
        # Master might be down or have some other internal problems
        # that might eventually be fixed. Retry the request.
        if response.code >= INTERNAL_SERVER_ERROR:
            if not self.shutting_down:
                delay = http_retry_delay()
                svclog.warning(
                    "Failed to post to master due to a server side error "
                    "error %s, retrying in %s seconds",
                    response.code, delay)
                yield deferLater(reactor, delay, self.post_agent_to_master)
            else:
                svclog.warning(
                    "Failed to post to master due to a server side error "
                    "error %s. Not retrying, because the agent is "
                    "shutting down", response.code)

        # Master is up but is rejecting our request because there's
        # something wrong with it. Do not retry the request.
        elif response.code >= BAD_REQUEST:
            text = yield response.text()
            svclog.error(
                "%s accepted our POST request but responded with code %s "
                "which is a client side error. The message the server "
                "responded with was %r. Sorry, but we cannot retry this "
                "request as it's an issue with the agent's request.",
                url, response.code, text)

        else:
            # Success: remember the id the master assigned us.
            data = yield treq.json_content(response)
            config["agent_id"] = data["id"]
            config.master_contacted()

            if response.code == OK:
                svclog.info(
                    "POST to %s was successful. Agent %s was updated.",
                    url, config["agent_id"])
            elif response.code == CREATED:
                svclog.info(
                    "POST to %s was successful. A new agent "
                    "with an id of %s was created.",
                    url, config["agent_id"])

            returnValue(data)
def post_shutdown_to_master(self):
    """
    This method is called before the reactor shuts down and lets the
    master know that the agent's state is now ``offline``
    """
    # We're under the assumption that something's wrong with
    # our code if we try to call this method before self.shutting_down
    # is set.
    assert self.shutting_down
    yield self.post_shutdown_lock.acquire()
    svclog.info("Informing master of shutdown")

    # Because post_shutdown_to_master is blocking and needs to
    # stop the reactor from finishing we perform the retry in-line
    data = None
    tries = 0
    num_retry_errors = 0
    response = None
    timed_out = False
    while True:
        tries += 1
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": AgentState.OFFLINE,
                    "free_ram": memory.free_ram(),
                    "current_assignments":
                        config["current_assignments"]})
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            # Transient transport failures: retry immediately, but stop
            # once the retry budget or the shutdown deadline is reached.
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to post shutdown to the master, "
                    "caught try-again errors %s times in a row.",
                    num_retry_errors)
                break
            elif self.shutdown_timeout < datetime.utcnow():
                svclog.error("While posting shutdown to master, caught "
                             "%s. Shutdown timeout has been reached, not "
                             "retrying.", error.__class__.__name__)
                break
            else:
                svclog.debug("While posting shutdown to master, caught "
                             "%s. Retrying immediately.",
                             error.__class__.__name__)

        # When we get a hard failure it could be an issue with the
        # server, although it's unlikely, so we retry. Only retry
        # for a set period of time though since the shutdown as a timeout
        except Exception as failure:
            if self.shutdown_timeout > datetime.utcnow():
                delay = http_retry_delay()
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Retrying in %s seconds", failure, delay)

                # Wait for 'pause' to fire, introducing a delay
                pause = Deferred()
                reactor.callLater(delay, pause.callback, None)
                yield pause
            else:
                timed_out = True
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Shutdown timeout reached, not retrying.",
                    failure)
                break

        else:
            data = yield treq.json_content(response)
            if response.code == NOT_FOUND:
                svclog.warning(
                    "Agent %r no longer exists, cannot update state.",
                    config["agent_id"])
                break
            elif response.code == OK:
                svclog.info(
                    "Agent %r has POSTed shutdown state change "
                    "successfully.",
                    config["agent_id"])
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                # Server-side error: retry until the shutdown deadline.
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Retrying in %s seconds.",
                        data, delay)

                    # Wait for 'pause' to fire, introducing a delay
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause
                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Shutdown timeout reached, not retrying.",
                        data)
                    break

    yield self.post_shutdown_lock.release()

    # Attach bookkeeping about how the shutdown post went to whatever
    # payload (if any) the master returned.
    extra_data = {
        "response": response,
        "timed_out": timed_out,
        "tries": tries,
        "retry_errors": num_retry_errors
    }
    if isinstance(data, dict):
        data.update(extra_data)
    else:
        data = extra_data

    returnValue(data)
def reannounce(self, force=False):
    """
    Method which is used to periodically contact the master.  This
    method is generally called as part of a scheduled task.
    """
    # Attempt to acquire the reannounce lock but fail after 70%
    # of the total time between reannouncements elapses. This should
    # help prevent an accumulation of requests in the event the master
    # is having issues.
    try:
        yield self.reannounce_lock.acquire(
            config["agent_master_reannounce"] * .70
        )
    except utility.LockTimeoutError:
        svclog.debug("Timed out while waiting to acquire reannounce_lock")
        returnValue(None)

    if not self.should_reannounce() and not force:
        yield self.reannounce_lock.release()
        returnValue(None)

    svclog.debug("Announcing %s to master", config["agent_hostname"])
    data = None
    num_retry_errors = 0
    while True:  # for retries
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": config["state"],
                    "current_assignments": config.get(
                        "current_assignments", {}  # may not be set yet
                    ),
                    "free_ram": memory.free_ram(),
                    "disks": disks.disks(as_dict=True)
                }
            )
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            # Transient transport failures: retry immediately up to the
            # configured budget.
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to announce self to the master, "
                    "caught try-again type errors %s times in a row.",
                    num_retry_errors)
                break
            else:
                svclog.debug("While announcing self to master, caught "
                             "%s. Retrying immediately.",
                             error.__class__.__name__)
        except Exception as error:
            if force:
                delay = http_retry_delay()
                svclog.error(
                    "Failed to announce self to the master: %s. Will "
                    "retry in %s seconds.", error, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Don't retry because reannounce is called periodically
                svclog.error(
                    "Failed to announce self to the master: %s. This "
                    "request will not be retried.", error)
                break
        else:
            data = yield treq.json_content(response)
            if response.code == OK:
                config.master_contacted(announcement=True)
                svclog.info("Announced self to the master server.")
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Could not announce self to the master server, "
                        "internal server error: %s. Retrying in %s "
                        "seconds.", data, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    svclog.warning(
                        "Could not announce to master. Not retrying "
                        "because of pending shutdown.")
                    break
            elif response.code == NOT_FOUND:
                # The master forgot about us; re-register from scratch.
                svclog.warning("The master says it does not know about our "
                               "agent id. Posting as a new agent.")
                yield self.post_agent_to_master()
                break
            # If this is a client problem retrying the request
            # is unlikely to fix the issue so we stop here
            elif response.code >= BAD_REQUEST:
                svclog.error(
                    "Failed to announce self to the master, bad "
                    "request: %s. This request will not be retried.",
                    data)
                break
            else:
                svclog.error(
                    "Unhandled error when posting self to the "
                    "master: %s (code: %s). This request will not be "
                    "retried.",
                    data, response.code)
                break

    yield self.reannounce_lock.release()
    returnValue(data)
def test_invalid_type_factor(self):
    """A non-numeric ``factor`` is rejected with ``AssertionError``."""
    self.assertRaises(AssertionError, http_retry_delay, factor="")
def test_invalid_type_offset(self):
    """A non-numeric ``offset`` is rejected with ``AssertionError``."""
    self.assertRaises(AssertionError, http_retry_delay, offset="")
def test_custom_offset_and_factor(self):
    """Explicit offset/factor with a deterministic rand yield 2."""
    delay = http_retry_delay(offset=1, factor=1, rand=lambda: 1)
    self.assertEqual(delay, 2)
def get_software_version_data(software, version):
    """
    Asynchronously fetches the known data about the given software
    version from the master.

    :param str software:
        The name of the software to get data for

    :param str version:
        The name of the version to get data for

    :return:
        Returns information about the given software version from
        the master

    :raises VersionNotFound:
        If the master responds with 404 NOT FOUND for this version.
    """
    url = "{master_api}/software/{software}/versions/{version}".\
        format(master_api=config.get("master_api"),
               software=software,
               version=version)
    while True:
        try:
            # inlineCallbacks-style asynchronous GET request.
            response = yield get_direct(url)
        except Exception as error:
            # Transport-level failure: wait and retry the request.
            delay = http_retry_delay()
            logger.error(
                "Failed to get data about software %s, version %s: %r. Will "
                "retry in %s seconds.", software, version, error, delay)
            deferred = Deferred()
            reactor.callLater(delay, deferred.callback, None)
            yield deferred
        else:
            if response.code == OK:
                data = yield treq.json_content(response)
                returnValue(data)
            elif response.code >= INTERNAL_SERVER_ERROR:
                # Server-side problem which may clear up; retry after
                # a delay.
                delay = http_retry_delay()
                logger.warning(
                    "Could not get data for software %s, version %s, server "
                    "responded with INTERNAL_SERVER_ERROR. Retrying in %s "
                    "seconds.", software, version, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            elif response.code == NOT_FOUND:
                # A missing version is a permanent condition; surface it
                # as a dedicated exception instead of retrying.
                logger.error("Got 404 NOT FOUND from server on getting data "
                             "for software %s, version %s",
                             software, version)
                raise VersionNotFound("This software version was not found or "
                                      "has no discovery code.")
            else:
                # Any other status is unexpected and non-retryable.
                logger.error(
                    "Failed to get data for software %s, version %s: "
                    "Unexpected status from server %s",
                    software, version, response.code)
                raise Exception("Unknown return code from master: %s"
                                % response.code)
def error_callback(cburl, cbdata, task, failure_reason):
    """
    Log a request-level failure while marking *task* as failed on the
    master, then schedule a retry of the update via ``post_update``.

    Bug fix: the original format string contained only one ``%s``
    placeholder but was passed two arguments (``task["id"]`` and
    ``failure_reason``), so the logging module reported a string
    formatting error instead of the intended message.  A second
    placeholder now consumes ``failure_reason``.
    """
    logger.error(
        "Error while marking task %s as failed: %s, retrying",
        task["id"], failure_reason)
    post_update(cburl, cbdata, task, delay=http_retry_delay())