def prepare_config(self):
    for key in self._pop_config_keys:
        config.pop(key, None)

    config.update({
        # Default to a uniform retry delay of 0.1 seconds (a factor
        # of zero means the delay does not grow between attempts)
        "agent_http_retry_delay_offset": 0.1,
        "agent_http_retry_delay_factor": 0,
        "shutting_down": False,
        "jobtypes": {},
        "current_assignments": {},
        "agent_id": uuid.uuid4(),
        "agent_http_persistent_connections": False,
        "agent_shutdown_timeout": 3,
        "master": PYFARM_AGENT_MASTER,
        "agent_hostname": os.urandom(self.RAND_LENGTH).encode("hex"),
        "agent_ram": memory.total_ram(),
        "agent_cpus": cpu.total_cpus(),
        "agent_api_port": randint(10000, 50000),
        "free_ram": memory.free_ram(),
        "agent_time_offset": randint(-50, 50),
        "state": choice(AgentState),
        "start": time.time(),
        "agent_pretty_json": False,
        "agent_html_template_reload": True,
        "agent_master_reannounce": randint(5, 15)})
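# A minimal sketch of how the two retry-delay keys above might combine.
# `http_retry_delay_sketch` is a hypothetical stand-in, not PyFarm's
# actual http_retry_delay(); the linear offset + factor scheme is an
# assumption based on the key names and the comment above.
import random

def http_retry_delay_sketch(offset=0.1, factor=0):
    # Fixed offset plus a factor-scaled random component.  With the
    # defaults above (offset=0.1, factor=0) every retry waits a flat
    # 0.1 seconds.
    return offset + factor * random.random()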
def get(self, **_):
    # Get counts for child processes and grandchild processes
    process = psutil.Process()
    direct_child_processes = len(process.children(recursive=False))
    all_child_processes = len(process.children(recursive=True))
    grandchild_processes = all_child_processes - direct_child_processes

    # Determine the last time we talked to the master (if ever)
    contacted = config.master_contacted(update=False)
    if isinstance(contacted, datetime):  # pragma: no cover
        contacted = datetime.utcnow() - contacted

    # Determine the last time we announced ourselves to the
    # master (if ever)
    last_announce = config.get("last_announce", None)
    if isinstance(last_announce, datetime):  # pragma: no cover
        last_announce = datetime.utcnow() - last_announce

    data = {
        "state": config["state"],
        "agent_hostname": config["agent_hostname"],
        "free_ram": memory.free_ram(),
        "agent_process_ram": memory.process_memory(),
        "consumed_ram": memory.total_consumption(),
        "child_processes": direct_child_processes,
        "grandchild_processes": grandchild_processes,
        "pids": config["pids"],
        "agent_id": config["agent_id"],
        "last_master_contact": contacted,
        "last_announce": last_announce,
        "agent_lock_file": config["agent_lock_file"],
        "uptime": total_seconds(
            timedelta(seconds=time.time() - config["start"])),
        "jobs": list(config["jobtypes"].keys())}

    if config["farm_name"]:
        data["farm_name"] = config["farm_name"]

    return dumps(data)
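# Illustrative only: fetching this status document over HTTP with the
# `requests` library.  The route ("/api/v1/status") and the port are
# assumptions -- the agent picks a random api port in prepare_config
# above -- so substitute your agent's actual values.
import requests

status = requests.get("http://localhost:50000/api/v1/status").json()
print(status["agent_hostname"], status["free_ram"], status["jobs"])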
def callback_agent_id_set(
        self, change_type, key, new_value, old_value, shutdown_events=True):
    """
    When `agent_id` is created we need to:

        * Register a shutdown event so that when the agent is told to
          shutdown it will notify the master of a state change.
        * Start the scheduled task manager
    """
    if key == "agent_id" and change_type == config.CREATED \
            and not self.register_shutdown_events:
        if shutdown_events:
            self.register_shutdown_events = True

        # set the initial free_ram
        config["free_ram"] = memory.free_ram()

        config.master_contacted()
        svclog.debug(
            "`%s` was %s, adding system event trigger for shutdown",
            key, change_type)
        self.repeating_call(
            config["agent_master_reannounce"], self.reannounce)
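# A minimal sketch of what `repeating_call` could look like, built on
# twisted.internet.task.LoopingCall.  This is an assumption for
# illustration only; the agent's real repeating_call may add jitter,
# deferred handling, and repeat limits.
from twisted.internet import task

def repeating_call_sketch(interval, func, *args, **kwargs):
    loop = task.LoopingCall(func, *args, **kwargs)
    # now=False: wait one full interval before the first call
    loop.start(interval, now=False)
    return loop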
@inlineCallbacks
def post_shutdown_to_master(self):
    """
    This method is called before the reactor shuts down and lets the
    master know that the agent's state is now ``offline``.
    """
    # We're under the assumption that something's wrong with
    # our code if we try to call this method before self.shutting_down
    # is set.
    assert self.shutting_down
    yield self.post_shutdown_lock.acquire()
    svclog.info("Informing master of shutdown")

    # Because post_shutdown_to_master needs to hold up the reactor
    # until it finishes, we perform the retry in-line.
    data = None
    tries = 0
    num_retry_errors = 0
    response = None
    timed_out = False
    while True:
        tries += 1
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": AgentState.OFFLINE,
                    "free_ram": memory.free_ram(),
                    "current_assignments": config["current_assignments"]})
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to post shutdown to the master, "
                    "caught try-again errors %s times in a row.",
                    num_retry_errors)
                break
            elif self.shutdown_timeout < datetime.utcnow():
                svclog.error(
                    "While posting shutdown to master, caught %s. "
                    "Shutdown timeout has been reached, not retrying.",
                    error.__class__.__name__)
                break
            else:
                svclog.debug(
                    "While posting shutdown to master, caught %s. "
                    "Retrying immediately.", error.__class__.__name__)
        # When we get a hard failure it could be an issue with the
        # server, although it's unlikely, so we retry.  Only retry
        # for a set period of time though, since the shutdown has a
        # timeout.
        except Exception as failure:
            if self.shutdown_timeout > datetime.utcnow():
                delay = http_retry_delay()
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Retrying in %s seconds", failure, delay)

                # Wait for 'pause' to fire, introducing a delay
                pause = Deferred()
                reactor.callLater(delay, pause.callback, None)
                yield pause
            else:
                timed_out = True
                svclog.warning(
                    "State update failed due to unhandled error: %s. "
                    "Shutdown timeout reached, not retrying.", failure)
                break
        else:
            data = yield treq.json_content(response)
            if response.code == NOT_FOUND:
                svclog.warning(
                    "Agent %r no longer exists, cannot update state.",
                    config["agent_id"])
                break
            elif response.code == OK:
                svclog.info(
                    "Agent %r has POSTed shutdown state change "
                    "successfully.", config["agent_id"])
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                if self.shutdown_timeout > datetime.utcnow():
                    delay = http_retry_delay()
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Retrying in %s seconds.", data, delay)

                    # Wait for 'pause' to fire, introducing a delay
                    pause = Deferred()
                    reactor.callLater(delay, pause.callback, None)
                    yield pause
                else:
                    timed_out = True
                    svclog.warning(
                        "State update failed due to server error: %s. "
                        "Shutdown timeout reached, not retrying.", data)
                    break

    yield self.post_shutdown_lock.release()
    extra_data = {
        "response": response,
        "timed_out": timed_out,
        "tries": tries,
        "retry_errors": num_retry_errors}

    if isinstance(data, dict):
        data.update(extra_data)
    else:
        data = extra_data

    returnValue(data)
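# The pause idiom above (create a Deferred, fire it via
# reactor.callLater, then yield it) is what
# twisted.internet.task.deferLater packages up.  A sketch of a
# reusable helper under that assumption:
from twisted.internet import reactor, task

def async_sleep(seconds):
    # Returns a Deferred that fires after `seconds` without
    # blocking the reactor; usable as `yield async_sleep(delay)`
    # inside an inlineCallbacks generator.
    return task.deferLater(reactor, seconds, lambda: None)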
def system_data(self, requery_timeoffset=False):
    """
    Returns a dictionary of data containing information about the
    agent.  This is the information that is also passed along to
    the master.
    """
    # query the time offset and then cache it since
    # this is typically a blocking operation
    if config["agent_time_offset"] == "auto":
        config["agent_time_offset"] = None

    if requery_timeoffset or config["agent_time_offset"] is None:
        ntplog.info(
            "Querying ntp server %r for current time",
            config["agent_ntp_server"])

        ntp_client = NTPClient()
        try:
            pool_time = ntp_client.request(
                config["agent_ntp_server"],
                version=config["agent_ntp_server_version"])
        except Exception as e:
            ntplog.warning("Failed to determine network time: %s", e)
        else:
            config["agent_time_offset"] = \
                int(pool_time.tx_time - time.time())

            # format the offset for logging purposes
            utcoffset = datetime.utcfromtimestamp(pool_time.tx_time)
            iso_timestamp = utcoffset.isoformat()
            ntplog.debug(
                "network time: %s (local offset: %r)",
                iso_timestamp, config["agent_time_offset"])

            if config["agent_time_offset"] != 0:
                ntplog.warning(
                    "Agent is %r second(s) off from ntp server at %r",
                    config["agent_time_offset"],
                    config["agent_ntp_server"])

    data = {
        "id": config["agent_id"],
        "hostname": config["agent_hostname"],
        "version": config.version,
        "os_class": system.operating_system(),
        "os_fullname": platform(),
        "ram": int(config["agent_ram"]),
        "cpus": config["agent_cpus"],
        "cpu_name": cpu.cpu_name(),
        "port": config["agent_api_port"],
        "free_ram": memory.free_ram(),
        "time_offset": config["agent_time_offset"] or 0,
        "state": config["state"],
        "mac_addresses": list(network.mac_addresses()),
        "current_assignments": config.get(
            "current_assignments", {}),  # may not be set yet
        "disks": disks.disks(as_dict=True)}

    try:
        gpu_names = graphics.graphics_cards()
        data["gpus"] = gpu_names
    except graphics.GPULookupError:
        pass

    if "remote_ip" in config:
        data.update(remote_ip=config["remote_ip"])

    if config["farm_name"]:
        data["farm_name"] = config["farm_name"]

    return data
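# Worked example of the offset computation above, with invented
# timestamps (the real values come from the NTP response and
# time.time()):
pool_tx_time = 1500000010.0  # hypothetical NTP transmit time
local_time = 1500000007.0    # hypothetical local time.time()
offset = int(pool_tx_time - local_time)
assert offset == 3           # the agent's clock is 3 seconds behind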
@inlineCallbacks
def reannounce(self, force=False):
    """
    Method which is used to periodically contact the master.  This
    method is generally called as part of a scheduled task.
    """
    # Attempt to acquire the reannounce lock but fail after 70%
    # of the total time between reannouncements elapses.  This should
    # help prevent an accumulation of requests in the event the master
    # is having issues.
    try:
        yield self.reannounce_lock.acquire(
            config["agent_master_reannounce"] * .70)
    except utility.LockTimeoutError:
        svclog.debug("Timed out while waiting to acquire reannounce_lock")
        returnValue(None)

    if not self.should_reannounce() and not force:
        yield self.reannounce_lock.release()
        returnValue(None)

    svclog.debug("Announcing %s to master", config["agent_hostname"])
    data = None
    num_retry_errors = 0
    while True:  # for retries
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": config["state"],
                    "current_assignments": config.get(
                        "current_assignments", {}),  # may not be set yet
                    "free_ram": memory.free_ram(),
                    "disks": disks.disks(as_dict=True)})
        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to announce self to the master, "
                    "caught try-again type errors %s times in a row.",
                    num_retry_errors)
                break
            else:
                svclog.debug(
                    "While announcing self to master, caught %s. "
                    "Retrying immediately.", error.__class__.__name__)
        except Exception as error:
            if force:
                delay = http_retry_delay()
                svclog.error(
                    "Failed to announce self to the master: %s. "
                    "Will retry in %s seconds.", error, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Don't retry because reannounce is called periodically
                svclog.error(
                    "Failed to announce self to the master: %s. "
                    "This request will not be retried.", error)
                break
        else:
            data = yield treq.json_content(response)
            if response.code == OK:
                config.master_contacted(announcement=True)
                svclog.info("Announced self to the master server.")
                break
            elif response.code >= INTERNAL_SERVER_ERROR:
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Could not announce self to the master server, "
                        "internal server error: %s. Retrying in %s "
                        "seconds.", data, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    svclog.warning(
                        "Could not announce to master. Not retrying "
                        "because of pending shutdown.")
                    break
            elif response.code == NOT_FOUND:
                svclog.warning(
                    "The master says it does not know about our "
                    "agent id. Posting as a new agent.")
                yield self.post_agent_to_master()
                break
            # If this is a client problem retrying the request
            # is unlikely to fix the issue so we stop here
            elif response.code >= BAD_REQUEST:
                svclog.error(
                    "Failed to announce self to the master, bad "
                    "request: %s. This request will not be retried.",
                    data)
                break
            else:
                svclog.error(
                    "Unhandled error when posting self to the "
                    "master: %s (code: %s). This request will not be "
                    "retried.", data, response.code)
                break

    yield self.reannounce_lock.release()
    returnValue(data)
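# Worked example of the lock timeout above, using a hypothetical
# reannounce interval (the real value is randint(5, 15), see
# prepare_config): acquisition gives up after 70% of the interval,
# leaving headroom before the next scheduled reannounce fires.
reannounce_interval = 10
lock_timeout = reannounce_interval * .70
assert lock_timeout == 7.0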
def test_ramfree(self):
    v1 = convert.bytetomb(psutil.virtual_memory().available)
    v2 = memory.free_ram()
    self.assertApproximates(v1, v2, 5)
def test_ramused(self):
    v1 = memory.total_ram() - memory.free_ram()
    v2 = memory.used_ram()
    self.assertApproximates(v1, v2, 5)
def post(self, **kwargs):
    if request_from_master(kwargs["request"]):
        config.master_contacted()
    request = kwargs["request"]
    request_data = kwargs["data"]

    # First, get the resources we have *right now*.  In some cases
    # this means using the functions in pyfarm.core.sysinfo because
    # entries in `config` could be slightly out of sync with the system.
    memory_free = free_ram()
    cpus = config["agent_cpus"]
    requires_ram = request_data["job"].get("ram")
    requires_cpus = request_data["job"].get("cpus")

    if ("agent_id" in request_data
            and request_data["agent_id"] != config["agent_id"]):
        logger.error(
            "Wrong agent_id in assignment: %s. Our id is %s",
            request_data["agent_id"], config["agent_id"])
        return (
            dumps({"error": "You have the wrong agent. "
                            "I am %s." % config["agent_id"],
                   "agent_id": config["agent_id"]}),
            BAD_REQUEST)

    elif self.agent.reannounce_lock.locked:
        logger.warning(
            "Temporarily rejecting assignment because we "
            "are in the middle of a reannounce.")
        return (
            dumps({"error": "Agent cannot accept assignments because "
                            "of a reannounce in progress. Try again "
                            "shortly."}),
            SERVICE_UNAVAILABLE)

    elif self.agent.shutting_down:
        logger.error(
            "Rejecting assignment because the agent is in the "
            "process of shutting down.")
        return (
            dumps({"error": "Agent cannot accept assignments because "
                            "it is shutting down."}),
            SERVICE_UNAVAILABLE)

    elif "restart_requested" in config \
            and config["restart_requested"] is True:
        logger.error("Rejecting assignment because of scheduled restart.")
        return (
            dumps({"error": "Agent cannot accept assignments because "
                            "of a pending restart."}),
            SERVICE_UNAVAILABLE)

    elif "agent_id" not in config:
        logger.error(
            "Agent has not yet connected to the master or `agent_id` "
            "has not been set yet.")
        return (
            dumps({"error": "agent_id has not been set in the config"}),
            SERVICE_UNAVAILABLE)

    # Do we have enough ram?
    elif requires_ram is not None and requires_ram > memory_free:
        logger.error(
            "Task %s requires %sMB of ram, this agent has %sMB free. "
            "Rejecting Task %s.",
            request_data["job"]["id"], requires_ram, memory_free,
            request_data["job"]["id"])
        config["free_ram"] = memory_free
        return (
            dumps({"error": "Not enough ram",
                   "agent_ram": memory_free,
                   "requires_ram": requires_ram}),
            BAD_REQUEST)

    # Do we have enough cpus (count wise)?
    elif requires_cpus is not None and requires_cpus > cpus:
        logger.error(
            "Task %s requires %s CPUs, this agent has %s CPUs. "
" "Rejecting Task %s.", request_data["job"]["id"], requires_cpus, cpus, request_data["job"]["id"]) return ( dumps({"error": "Not enough cpus", "agent_cpus": cpus, "requires_cpus": requires_cpus}), BAD_REQUEST ) new_task_ids = set(task["id"] for task in request_data["tasks"]) for assignment in config["current_assignments"].itervalues(): existing_task_ids = set(x["id"] for x in assignment["tasks"]) # If the assignment is identical to one we already have if existing_task_ids == new_task_ids: logger.debug( "Ignoring repeated assignment of the same batch") return dumps({"id": assignment["id"]}), ACCEPTED # If there is only a partial overlap elif existing_task_ids & new_task_ids: logger.error("Rejecting assignment with partial overlap with " "existing assignment.") unknown_task_ids = new_task_ids - existing_task_ids return ( dumps({"error": "Partial overlap of tasks", "rejected_task_ids": list(unknown_task_ids)}), CONFLICT ) if not config["agent_allow_sharing"]: for jobtype in config["jobtypes"].itervalues(): num_finished_tasks = (len(jobtype.finished_tasks) + len(jobtype.failed_tasks)) if len(jobtype.assignment["tasks"]) > num_finished_tasks: logger.error("Rejecting an assignment that would require " "agent sharing") return ( dumps({ "error": "Agent does not allow multiple " "assignments", "rejected_task_ids": list(new_task_ids)}), CONFLICT ) assignment_uuid = uuid4() request_data.update(id=assignment_uuid) config["current_assignments"][assignment_uuid] = request_data logger.debug("Accepted assignment %s: %r", assignment_uuid, request_data) logger.info("Accept assignment from job %s with %s tasks", request_data["job"]["title"], len(request_data["tasks"])) def assignment_failed(result, assign_id): logger.error( "Assignment %s failed, result: %r, removing.", assign_id, result) logger.error(result.getTraceback()) if (len(config["current_assignments"]) <= 1 and not self.agent.shutting_down): config["state"] = AgentState.ONLINE self.agent.reannounce(force=True) # Do not mark the assignment as failed if the reason for failing # was that we ran out of disk space failed = not isinstance(result.value, InsufficientSpaceError) assignment = config["current_assignments"].pop(assign_id) if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: instance = config["jobtypes"].pop(jobtype_id, None) instance.stop( assignment_failed=failed, avoid_reassignment=not failed, error="Error in jobtype: %r. 
" "Traceback: %s" % (result, traceback.format_exc())) def assignment_started(_, assign_id): logger.debug("Assignment %s has started", assign_id) config["state"] = AgentState.RUNNING self.agent.reannounce(force=True) def remove_assignment(_, assign_id): assignment = config["current_assignments"].pop(assign_id) if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: config["jobtypes"].pop(jobtype_id, None) def assignment_stopped(_, assign_id): logger.debug("Assignment %s has stopped", assign_id) if (len(config["current_assignments"]) <= 1 and not self.agent.shutting_down): config["state"] = AgentState.ONLINE self.agent.reannounce(force=True) assignment = config["current_assignments"][assign_id] if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: jobtype = config["jobtypes"].pop(jobtype_id, None) updates_deferred = DeferredList( jobtype.task_update_deferreds) updates_deferred.addBoth(remove_assignment, assign_id) else: config["current_assignments"].pop(assign_id) def restart_if_necessary(_): # pragma: no cover if "restart_requested" in config and config["restart_requested"]: stopping = config["agent"].stop() stopping.addCallbacks(lambda _: reactor.stop(), lambda _: reactor.stop()) def load_jobtype_failed(result, assign_id): logger.error( "Loading jobtype for assignment %s failed, removing.", assign_id) traceback = result.getTraceback() logger.debug("Got traceback") logger.error(traceback) assignment = config["current_assignments"].pop(assign_id) # Mark all tasks as failed on master and set an error message logger.debug("Marking tasks in assignment as failed") def post_update(post_url, post_data, task, delay=0): post_func = partial(post, post_url, data=post_data, callback=lambda x: result_callback( post_url, post_data, task, x), errback=lambda x: error_callback( post_url, post_data, task, x)) reactor.callLater(delay, post_func) def result_callback(cburl, cbdata, task, response): if 500 <= response.code < 600: logger.error( "Error while marking task %s as failed on master, " "retrying", task["id"]) post_update(cburl, cbdata, task, delay=http_retry_delay()) elif response.code != OK: logger.error( "Could not mark task %s as failed, server response " "code was %s", task["id"], response.code) else: logger.info( "Marked task %s as failed on master", task["id"]) def error_callback(cburl, cbdata, task, failure_reason): logger.error( "Error while marking task %s as failed, retrying", task["id"], failure_reason) post_update(cburl, cbdata, task, delay=http_retry_delay()) for task in assignment["tasks"]: url = "%s/jobs/%s/tasks/%s" % ( config["master_api"], assignment["job"]["id"], task["id"]) data = { "state": WorkState.FAILED, "last_error": traceback} post_update(url, data, task) # If the loading was partially successful for some reason, there # might already be an entry for this jobtype in the config. # Remove it if it exists. if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: config["jobtypes"].pop(jobtype_id, None) def loaded_jobtype(jobtype_class, assign_id): # TODO: report error to master if hasattr(jobtype_class, "getTraceback"): logger.error(jobtype_class.getTraceback()) return # TODO: add call to prepare_for_job # TODO: add call to spawn_persistent_process # Instance the job type and pass in the assignment data. 
        instance = jobtype_class(request_data)

        if not isinstance(instance, JobType):
            raise TypeError(
                "Expected a subclass of "
                "pyfarm.jobtypes.core.jobtype.JobType")

        # TODO: add callback to cleanup_after_job
        # TODO: add callback to stop persistent process
        try:
            started_deferred, stopped_deferred = instance._start()
            started_deferred.addCallback(assignment_started, assign_id)
            started_deferred.addErrback(assignment_failed, assign_id)
            stopped_deferred.addCallback(assignment_stopped, assign_id)
            stopped_deferred.addErrback(assignment_failed, assign_id)
            stopped_deferred.addBoth(restart_if_necessary)
            stopped_deferred.addBoth(
                lambda *args: instance._remove_tempdirs())
            stopped_deferred.addBoth(
                lambda *args: instance._close_logs())
            stopped_deferred.addBoth(
                lambda *args: instance._upload_logfile())
        except Exception as e:
            logger.error(
                "Error on starting jobtype, stopping it now. "
                "Error was: %r. Traceback: %s",
                e, traceback.format_exc())
            instance.stop(
                assignment_failed=True,
                error="Error while loading jobtype: %r. "
                      "Traceback: %s" % (e, traceback.format_exc()))
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

    # Load the job type then pass the class along to the
    # callback.  No errback here because all the errors
    # are handled internally in this case.
    jobtype_loader = JobType.load(request_data)
    jobtype_loader.addCallback(loaded_jobtype, assignment_uuid)
    jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid)

    return dumps({"id": assignment_uuid}), ACCEPTED
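# Hypothetical example of an assignment payload this endpoint accepts.
# Only the keys the handler reads above are grounded (agent_id,
# job.id/title/ram/cpus, tasks[].id, and jobtype.id, which is popped
# after loading); all values and the "frame"/"name" fields are
# invented for illustration.
example_assignment = {
    "agent_id": "1d6f9016-6e0e-4a33-8b2c-0f1d7d3f9a11",
    "job": {"id": 42, "title": "render_shot_010",
            "ram": 2048, "cpus": 2},
    "jobtype": {"id": 7, "name": "ExampleJobType"},
    "tasks": [{"id": 101, "frame": 1}, {"id": 102, "frame": 2}],
}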