def get(self, **kwargs):
    """Report the API versions this agent supports."""
    incoming = kwargs.get("request")

    # A request originating from the master counts as contact, so
    # refresh the last-contact timestamp before responding.
    if incoming is not None:
        if request_from_master(incoming):
            config.master_contacted()

    return dumps(versions=[1]), OK
def test_get_config(self):
    """GET on the Config resource returns the agent's configuration."""
    request = self.get()
    config_ = Config()
    response = config_.render(request)
    # render() streams the body itself and signals this by returning
    # NOT_DONE_YET; the data ends up in request.written.
    self.assertEqual(response, NOT_DONE_YET)
    self.assertTrue(request.finished)
    self.assertEqual(request.responseCode, OK)
    self.assertEqual(len(request.written), 1)
    response = loads(request.written[0])
    # last_master_contact changes as a side effect of the request
    # itself, so exclude it from the key-by-key comparison below.
    response.pop("last_master_contact")
    current_config = config.copy()
    current_config.pop("last_master_contact")
    response.pop("agent")
    for key in response:
        self.assertIn(key, current_config)

        # HTTP responses are not automatically
        # converted from plain text into UUID
        # objects.
        if key == "agent_id":
            response[key] = UUID(response[key])

        self.assertEqual(
            response[key], current_config[key],
            "Data for key %r %r != %r" % (
                key, response[key], current_config[key]))

    # Rendering the request should have refreshed the contact time.
    self.assertDateAlmostEqual(
        config.master_contacted(update=False), datetime.utcnow())
def test_get_result(self):
    """GET on the Status resource returns the expected status payload."""
    # Compute the same process-tree counts the resource will report.
    process = psutil.Process()
    direct_child_processes = len(process.children(recursive=False))
    all_child_processes = len(process.children(recursive=True))
    grandchild_processes = all_child_processes - direct_child_processes

    # Determine the last time we talked to the master (if ever)
    contacted = config.master_contacted(update=False)
    if isinstance(contacted, datetime):
        contacted = datetime.utcnow() - contacted

    # Determine the last time we announced ourselves to the
    # master (if ever)
    last_announce = config.get("last_announce", None)
    if isinstance(last_announce, datetime):
        last_announce = datetime.utcnow() - last_announce

    # time.time is patched below to return future_time so the uptime
    # in the response is deterministic (30 seconds).
    future_time = config["start"] + 30
    process_memory = memory.process_memory()
    total_consumption = memory.total_consumption()
    expected_data = {
        "state": config["state"],
        "agent_hostname": config["agent_hostname"],
        "agent_process_ram": process_memory,
        "consumed_ram": total_consumption,
        "child_processes": direct_child_processes,
        "grandchild_processes": grandchild_processes,
        "pids": config["pids"],
        "agent_id": str(config["agent_id"]),
        "last_master_contact": contacted,
        "last_announce": last_announce,
        "agent_lock_file": config["agent_lock_file"],
        "free_ram": 4242,
        "uptime": total_seconds(
            timedelta(seconds=future_time - config["start"])),
        "jobs": list(config["jobtypes"].keys())}

    request = self.get()
    status = Status()

    # Pin every volatile system-info source so the rendered payload
    # can be compared exactly against expected_data.
    with nested(
        mock.patch.object(memory, "free_ram", return_value=4242),
        mock.patch.object(time, "time", return_value=future_time),
        mock.patch.object(
            memory, "process_memory", return_value=process_memory),
        mock.patch.object(
            memory, "total_consumption", return_value=total_consumption)
    ):
        response = status.render(request)
        self.assertEqual(response, NOT_DONE_YET)
        self.assertTrue(request.finished)
        self.assertEqual(request.responseCode, OK)
        self.assertEqual(len(request.written), 1)
        self.assertEqual(loads(request.written[0]), expected_data)
def should_reannounce(self):
    """Small trigger check used to decide whether :meth:`reannounce`
    should run."""
    # Never reannounce while another reannounce holds the lock or
    # while the agent is shutting down.
    if self.reannounce_lock.locked or self.shutting_down:
        return False

    last_contact = config.master_contacted(update=False)

    # No recorded contact at all means we should announce right away.
    if last_contact is None:
        return True

    # Otherwise only reannounce once the configured interval elapsed.
    elapsed = utility.total_seconds(datetime.utcnow() - last_contact)
    return elapsed > config["agent_master_reannounce"]
def test_versions(self):
    """GET on the Versions resource returns the supported versions list."""
    # Send the request with the master's User-Agent so the resource
    # treats it as master contact.
    request = self.get(headers={"User-Agent": config["master_user_agent"]})
    versions = Versions()
    response = versions.render(request)
    self.assertEqual(response, NOT_DONE_YET)
    self.assertTrue(request.finished)
    self.assertEqual(request.responseCode, OK)
    self.assertEqual(len(request.written), 1)
    self.assertEqual(loads(request.written[0]), {"versions": [1]})
    # A master request should have refreshed the contact timestamp.
    self.assertDateAlmostEqual(
        config.master_contacted(update=False), datetime.utcnow())
def callback_agent_id_set(
        self, change_type, key, new_value, old_value, shutdown_events=True):
    """
    When `agent_id` is created we need to:

        * Register a shutdown event so that when the agent is told to
          shutdown it will notify the master of a state change.
        * Start the scheduled task manager
    """
    # Only react to the initial creation of agent_id, and only once
    # (register_shutdown_events guards against repeat registration).
    if key == "agent_id" and change_type == config.CREATED \
            and not self.register_shutdown_events:
        if shutdown_events:
            self.register_shutdown_events = True

        # set the initial free_ram
        config["free_ram"] = memory.free_ram()

        config.master_contacted()
        svclog.debug(
            "`%s` was %s, adding system event trigger for shutdown",
            key, change_type)
        # Begin the periodic reannounce cycle now that we have an id.
        self.repeating_call(
            config["agent_master_reannounce"], self.reannounce)
def get(self, **_):
    """Build and return the agent's current status as a JSON body."""
    # Count direct children separately from grandchildren.
    current_process = psutil.Process()
    direct_children = len(current_process.children(recursive=False))
    all_children = len(current_process.children(recursive=True))
    grandchildren = all_children - direct_children

    # Express the last master contact as a time delta, if we have one.
    contacted = config.master_contacted(update=False)
    if isinstance(contacted, datetime):  # pragma: no cover
        contacted = datetime.utcnow() - contacted

    # Same treatment for the last announcement to the master.
    last_announce = config.get("last_announce", None)
    if isinstance(last_announce, datetime):  # pragma: no cover
        last_announce = datetime.utcnow() - last_announce

    data = {
        "state": config["state"],
        "agent_hostname": config["agent_hostname"],
        "free_ram": memory.free_ram(),
        "agent_process_ram": memory.process_memory(),
        "consumed_ram": memory.total_consumption(),
        "child_processes": direct_children,
        "grandchild_processes": grandchildren,
        "pids": config["pids"],
        "agent_id": config["agent_id"],
        "last_master_contact": contacted,
        "last_announce": last_announce,
        "agent_lock_file": config["agent_lock_file"],
        "uptime": total_seconds(
            timedelta(seconds=time.time() - config["start"])),
        "jobs": list(config["jobtypes"].keys())}

    # farm_name is optional configuration; only report it when set.
    if config["farm_name"]:
        data["farm_name"] = config["farm_name"]

    return dumps(data)
def post_agent_to_master(self):
    """
    Runs the POST request to contact the master.  Running this
    method multiple times should be considered safe but is generally
    something that should be avoided.
    """
    # NOTE(review): the yield/returnValue usage implies this method is
    # decorated with @inlineCallbacks -- the decorator is outside this
    # chunk, confirm at the definition site.
    url = self.agents_endpoint()
    data = self.system_data()

    try:
        response = yield post_direct(url, data=data)
    except Exception as failure:
        # Transport-level failure (no HTTP response at all).  Retry
        # after a delay unless we are shutting down.
        delay = http_retry_delay()
        if isinstance(failure, ConnectionRefusedError):
            svclog.error(
                "Failed to POST agent to master, the connection was "
                "refused. Retrying in %s seconds", delay)
        else:  # pragma: no cover
            svclog.error(
                "Unhandled error when trying to POST the agent to the "
                "master. The error was %s.", failure)

        if not self.shutting_down:
            svclog.info(
                "Retrying failed POST to master in %s seconds.", delay)
            yield deferLater(reactor, delay, self.post_agent_to_master)
        else:
            svclog.warning("Not retrying POST to master, shutting down.")
    else:
        # Master might be down or have some other internal problems
        # that might eventually be fixed.  Retry the request.
        if response.code >= INTERNAL_SERVER_ERROR:
            if not self.shutting_down:
                delay = http_retry_delay()
                svclog.warning(
                    "Failed to post to master due to a server side error "
                    "error %s, retrying in %s seconds",
                    response.code, delay)
                yield deferLater(reactor, delay, self.post_agent_to_master)
            else:
                svclog.warning(
                    "Failed to post to master due to a server side error "
                    "error %s. Not retrying, because the agent is "
                    "shutting down", response.code)

        # Master is up but is rejecting our request because there's
        # something wrong with it.  Do not retry the request.
        elif response.code >= BAD_REQUEST:
            text = yield response.text()
            svclog.error(
                "%s accepted our POST request but responded with code %s "
                "which is a client side error. The message the server "
                "responded with was %r. Sorry, but we cannot retry this "
                "request as it's an issue with the agent's request.",
                url, response.code, text)

        else:
            # Success path: the master assigned (or confirmed) our id.
            data = yield treq.json_content(response)
            config["agent_id"] = data["id"]
            config.master_contacted()

            if response.code == OK:
                svclog.info(
                    "POST to %s was successful. Agent %s was updated.",
                    url, config["agent_id"])

            elif response.code == CREATED:
                svclog.info(
                    "POST to %s was successful. A new agent "
                    "with an id of %s was created.",
                    url, config["agent_id"])

            returnValue(data)
def reannounce(self, force=False):
    """
    Method which is used to periodically contact the master.  This
    method is generally called as part of a scheduled task.
    """
    # NOTE(review): yield/returnValue implies @inlineCallbacks -- the
    # decorator is outside this chunk, confirm at the definition site.
    #
    # Attempt to acquire the reannounce lock but fail after 70%
    # of the total time between reannouncements elapses.  This should
    # help prevent an accumulation of requests in the event the master
    # is having issues.
    try:
        yield self.reannounce_lock.acquire(
            config["agent_master_reannounce"] * .70
        )
    except utility.LockTimeoutError:
        svclog.debug("Timed out while waiting to acquire reannounce_lock")
        returnValue(None)

    # Re-check the trigger condition after the (possibly long) wait
    # for the lock; `force` bypasses the check.
    if not self.should_reannounce() and not force:
        yield self.reannounce_lock.release()
        returnValue(None)

    svclog.debug("Announcing %s to master", config["agent_hostname"])
    data = None
    num_retry_errors = 0
    while True:  # for retries
        try:
            response = yield post_direct(
                self.agent_api(),
                data={
                    "state": config["state"],
                    "current_assignments": config.get(
                        "current_assignments", {}  # may not be set yet
                    ),
                    "free_ram": memory.free_ram(),
                    "disks": disks.disks(as_dict=True)
                }
            )

        except (ResponseNeverReceived, RequestTransmissionFailed) as error:
            # Transient transport failures: retry immediately, but give
            # up after broken_connection_max_retry consecutive failures.
            num_retry_errors += 1
            if num_retry_errors > config["broken_connection_max_retry"]:
                svclog.error(
                    "Failed to announce self to the master, "
                    "caught try-again type errors %s times in a row.",
                    num_retry_errors)
                break
            else:
                svclog.debug("While announcing self to master, caught "
                             "%s. Retrying immediately.",
                             error.__class__.__name__)

        except Exception as error:
            if force:
                # A forced announce must eventually succeed, so retry
                # after a delay even for unexpected errors.
                delay = http_retry_delay()
                svclog.error(
                    "Failed to announce self to the master: %s. Will "
                    "retry in %s seconds.", error, delay)
                deferred = Deferred()
                reactor.callLater(delay, deferred.callback, None)
                yield deferred
            else:
                # Don't retry because reannounce is called periodically
                svclog.error(
                    "Failed to announce self to the master: %s. This "
                    "request will not be retried.", error)
                break

        else:
            data = yield treq.json_content(response)
            if response.code == OK:
                config.master_contacted(announcement=True)
                svclog.info("Announced self to the master server.")
                break

            elif response.code >= INTERNAL_SERVER_ERROR:
                # Server-side problem: retry after a delay unless we
                # are in the middle of shutting down.
                if not self.shutting_down:
                    delay = http_retry_delay()
                    svclog.warning(
                        "Could not announce self to the master server, "
                        "internal server error: %s. Retrying in %s "
                        "seconds.", data, delay)
                    deferred = Deferred()
                    reactor.callLater(delay, deferred.callback, None)
                    yield deferred
                else:
                    svclog.warning(
                        "Could not announce to master. Not retrying "
                        "because of pending shutdown.")
                    break

            elif response.code == NOT_FOUND:
                # The master lost track of us; re-register from scratch.
                svclog.warning("The master says it does not know about our "
                               "agent id. Posting as a new agent.")
                yield self.post_agent_to_master()
                break

            # If this is a client problem retrying the request
            # is unlikely to fix the issue so we stop here
            elif response.code >= BAD_REQUEST:
                svclog.error(
                    "Failed to announce self to the master, bad "
                    "request: %s. This request will not be retried.",
                    data)
                break

            else:
                svclog.error(
                    "Unhandled error when posting self to the "
                    "master: %s (code: %s). This request will not be "
                    "retried.", data, response.code)
                break

    yield self.reannounce_lock.release()
    returnValue(data)
def get(self, **kwargs):
    """
    Get the contents of the specified task log.  The log will be
    returned in CSV format with the following fields:

        * ISO8601 timestamp
        * Stream number (0 == stdout, 1 == stderr)
        * Line number
        * Parent process ID
        * Message the process produced

    The log file identifier is configurable and relies on the
    `jobtype_task_log_filename` configuration option.  See the
    configuration documentation for more information about the
    default value.

    .. http:get:: /api/v1/tasklogs/<identifier> HTTP/1.1

        **Request**

        .. sourcecode:: http

            GET /api/v1/tasklogs/<identifier> HTTP/1.1

        **Response**

        .. sourcecode:: http

            HTTP/1.1 200 OK
            Content-Type: text/csv

            2015-05-07T23:42:53.730975,0,15,42,Hello world

    :statuscode 200:
        The log file was found, its content will be returned.
    """
    request = kwargs["request"]

    if request_from_master(request):
        config.master_contacted()

    # The URL must contain exactly one path segment: the log identifier.
    if len(request.postpath) != 1:
        return (
            dumps({"error": "Did not specify a log identifier"}),
            BAD_REQUEST,
            {"Content-Type": ["application/json"]}
        )

    # Reject identifiers that could escape the log directory.
    log_identifier = request.postpath[0]
    if "/" in log_identifier or "\\" in log_identifier:
        return (
            dumps({"error": "log_identifier must not contain "
                            "directory separators"}),
            BAD_REQUEST,
            {"Content-Type": ["application/json"]}
        )

    path = join(config["jobtype_task_logs"], log_identifier)

    try:
        logfile = self._open_file(path, "rb")
    except Exception as error:
        request.responseHeaders.setRawHeaders(
            "Content-Type", ["application/json"])

        # ENOENT maps to 404; everything else is a server-side failure.
        if getattr(error, "errno", None) == ENOENT:
            return dumps({"error": "%s does not exist" % path}), NOT_FOUND

        logger.error("GET %s failed: %s", request.postpath, error)
        return dumps({"error": str(error)}), INTERNAL_SERVER_ERROR

    # TODO: deferToThread for open? (and possibly send)
    request.setResponseCode(OK)
    request.responseHeaders.setRawHeaders("Content-Type", ["text/csv"])

    deferred = FileSender().beginFileTransfer(logfile, request)
    deferred.addCallback(lambda *_: request.finish())
    # BUGFIX: close the log file whether the transfer succeeds or fails.
    # Previously close() was attached only as a success callback, so an
    # errback from FileSender (e.g. client disconnect) leaked the open
    # file handle.  addBoth passes the result/failure through unchanged.
    deferred.addBoth(lambda result: (logfile.close(), result)[-1])
    return NOT_DONE_YET
def post(self, **kwargs):
    """
    Accept or reject an assignment pushed by the master.

    Validates the incoming assignment against this agent's identity,
    state and currently available resources, rejects duplicate and
    partially-overlapping batches, then asynchronously loads and starts
    the job type.  Returns a ``(json_body, http_code)`` tuple.
    """
    if request_from_master(kwargs["request"]):
        config.master_contacted()
    request = kwargs["request"]
    request_data = kwargs["data"]

    # First, get the resources we have *right now*.  In some cases
    # this means using the functions in pyfarm.core.sysinfo because
    # entries in `config` could be slightly out of sync with the system.
    memory_free = free_ram()
    cpus = config["agent_cpus"]
    requires_ram = request_data["job"].get("ram")
    requires_cpus = request_data["job"].get("cpus")

    if ("agent_id" in request_data
            and request_data["agent_id"] != config["agent_id"]):
        logger.error("Wrong agent_id in assignment: %s. Our id is %s",
                     request_data["agent_id"], config["agent_id"])
        return (
            dumps({"error": "You have the wrong agent. "
                            "I am %s." % config["agent_id"],
                   "agent_id": config["agent_id"]}),
            BAD_REQUEST
        )

    elif self.agent.reannounce_lock.locked:
        logger.warning("Temporarily rejecting assignment because we "
                       "are in the middle of a reannounce.")
        return (
            dumps({"error": "Agent cannot accept assignments because of a "
                            "reannounce in progress. Try again shortly."}),
            SERVICE_UNAVAILABLE
        )

    elif self.agent.shutting_down:
        logger.error("Rejecting assignment because the agent is in the "
                     "process of shutting down.")
        return (
            dumps({"error": "Agent cannot accept assignments because it is "
                            "shutting down."}),
            SERVICE_UNAVAILABLE
        )

    elif "restart_requested" in config \
            and config["restart_requested"] is True:
        logger.error("Rejecting assignment because of scheduled restart.")
        return (
            dumps({"error": "Agent cannot accept assignments because of a "
                            "pending restart."}),
            SERVICE_UNAVAILABLE
        )

    elif "agent_id" not in config:
        logger.error(
            "Agent has not yet connected to the master or `agent_id` "
            "has not been set yet.")
        return (
            dumps({"error": "agent_id has not been set in the config"}),
            SERVICE_UNAVAILABLE
        )

    # Do we have enough ram?
    elif requires_ram is not None and requires_ram > memory_free:
        logger.error(
            "Task %s requires %sMB of ram, this agent has %sMB free. "
            "Rejecting Task %s.",
            request_data["job"]["id"], requires_ram, memory_free,
            request_data["job"]["id"])
        config["free_ram"] = memory_free
        return (
            dumps({"error": "Not enough ram",
                   "agent_ram": memory_free,
                   "requires_ram": requires_ram}),
            BAD_REQUEST
        )

    # Do we have enough cpus (count wise)?
    elif requires_cpus is not None and requires_cpus > cpus:
        logger.error(
            "Task %s requires %s CPUs, this agent has %s CPUs. "
            "Rejecting Task %s.",
            request_data["job"]["id"], requires_cpus, cpus,
            request_data["job"]["id"])
        return (
            dumps({"error": "Not enough cpus",
                   "agent_cpus": cpus,
                   "requires_cpus": requires_cpus}),
            BAD_REQUEST
        )

    new_task_ids = set(task["id"] for task in request_data["tasks"])

    for assignment in config["current_assignments"].itervalues():
        existing_task_ids = set(x["id"] for x in assignment["tasks"])

        # If the assignment is identical to one we already have
        if existing_task_ids == new_task_ids:
            logger.debug(
                "Ignoring repeated assignment of the same batch")
            return dumps({"id": assignment["id"]}), ACCEPTED

        # If there is only a partial overlap
        elif existing_task_ids & new_task_ids:
            logger.error("Rejecting assignment with partial overlap with "
                         "existing assignment.")
            unknown_task_ids = new_task_ids - existing_task_ids
            return (
                dumps({"error": "Partial overlap of tasks",
                       "rejected_task_ids": list(unknown_task_ids)}),
                CONFLICT
            )

    if not config["agent_allow_sharing"]:
        for jobtype in config["jobtypes"].itervalues():
            num_finished_tasks = (len(jobtype.finished_tasks) +
                                  len(jobtype.failed_tasks))
            if len(jobtype.assignment["tasks"]) > num_finished_tasks:
                logger.error("Rejecting an assignment that would require "
                             "agent sharing")
                return (
                    dumps({
                        "error": "Agent does not allow multiple "
                                 "assignments",
                        "rejected_task_ids": list(new_task_ids)}),
                    CONFLICT
                )

    assignment_uuid = uuid4()
    request_data.update(id=assignment_uuid)
    config["current_assignments"][assignment_uuid] = request_data
    logger.debug("Accepted assignment %s: %r",
                 assignment_uuid, request_data)
    logger.info("Accept assignment from job %s with %s tasks",
                request_data["job"]["title"], len(request_data["tasks"]))

    def assignment_failed(result, assign_id):
        # Errback: tear down the assignment and (usually) mark it failed.
        logger.error(
            "Assignment %s failed, result: %r, removing.",
            assign_id, result)
        logger.error(result.getTraceback())
        if (len(config["current_assignments"]) <= 1
                and not self.agent.shutting_down):
            config["state"] = AgentState.ONLINE
            self.agent.reannounce(force=True)
        # Do not mark the assignment as failed if the reason for failing
        # was that we ran out of disk space
        failed = not isinstance(result.value, InsufficientSpaceError)
        assignment = config["current_assignments"].pop(assign_id)
        if "jobtype" in assignment:
            jobtype_id = assignment["jobtype"].pop("id", None)
            if jobtype_id:
                instance = config["jobtypes"].pop(jobtype_id, None)
                # BUGFIX: pop() returns None when the jobtype instance was
                # never registered or already removed; calling stop() on
                # None previously raised AttributeError.
                if instance is not None:
                    instance.stop(
                        assignment_failed=failed,
                        avoid_reassignment=not failed,
                        error="Error in jobtype: %r. "
                              "Traceback: %s" % (result,
                                                 traceback.format_exc()))

    def assignment_started(_, assign_id):
        # Callback: the job type has spawned its work.
        logger.debug("Assignment %s has started", assign_id)
        config["state"] = AgentState.RUNNING
        self.agent.reannounce(force=True)

    def remove_assignment(_, assign_id):
        # Drop the assignment and its jobtype instance from config.
        assignment = config["current_assignments"].pop(assign_id)
        if "jobtype" in assignment:
            jobtype_id = assignment["jobtype"].pop("id", None)
            if jobtype_id:
                config["jobtypes"].pop(jobtype_id, None)

    def assignment_stopped(_, assign_id):
        # Callback: the job type finished; wait for pending task updates
        # before removing the assignment.
        logger.debug("Assignment %s has stopped", assign_id)
        if (len(config["current_assignments"]) <= 1
                and not self.agent.shutting_down):
            config["state"] = AgentState.ONLINE
            self.agent.reannounce(force=True)
        assignment = config["current_assignments"][assign_id]
        if "jobtype" in assignment:
            jobtype_id = assignment["jobtype"].pop("id", None)
            if jobtype_id:
                jobtype = config["jobtypes"].pop(jobtype_id, None)
                # BUGFIX: guard against a missing jobtype instance rather
                # than raising AttributeError on None; still make sure the
                # assignment itself is removed.
                if jobtype is not None:
                    updates_deferred = DeferredList(
                        jobtype.task_update_deferreds)
                    updates_deferred.addBoth(remove_assignment, assign_id)
                else:
                    config["current_assignments"].pop(assign_id, None)
        else:
            config["current_assignments"].pop(assign_id)

    def restart_if_necessary(_):  # pragma: no cover
        # Honor a restart that was requested while work was running.
        if "restart_requested" in config and config["restart_requested"]:
            stopping = config["agent"].stop()
            stopping.addCallbacks(lambda _: reactor.stop(),
                                  lambda _: reactor.stop())

    def load_jobtype_failed(result, assign_id):
        # Errback for JobType.load: report every task as failed on the
        # master and clean up any partial state.
        logger.error(
            "Loading jobtype for assignment %s failed, removing.",
            assign_id)
        # Renamed from `traceback` so the module is not shadowed.
        failure_traceback = result.getTraceback()
        logger.debug("Got traceback")
        logger.error(failure_traceback)
        assignment = config["current_assignments"].pop(assign_id)

        # Mark all tasks as failed on master and set an error message
        logger.debug("Marking tasks in assignment as failed")

        def post_update(post_url, post_data, task, delay=0):
            post_func = partial(
                post, post_url, data=post_data,
                callback=lambda x: result_callback(
                    post_url, post_data, task, x),
                errback=lambda x: error_callback(
                    post_url, post_data, task, x))
            reactor.callLater(delay, post_func)

        def result_callback(cburl, cbdata, task, response):
            # Retry on 5xx, give up on other non-OK codes.
            if 500 <= response.code < 600:
                logger.error(
                    "Error while marking task %s as failed on master, "
                    "retrying", task["id"])
                post_update(cburl, cbdata, task, delay=http_retry_delay())

            elif response.code != OK:
                logger.error(
                    "Could not mark task %s as failed, server response "
                    "code was %s", task["id"], response.code)

            else:
                logger.info(
                    "Marked task %s as failed on master", task["id"])

        def error_callback(cburl, cbdata, task, failure_reason):
            # BUGFIX: the format string was missing the placeholder for
            # failure_reason, which broke log formatting.
            logger.error(
                "Error while marking task %s as failed, retrying: %s",
                task["id"], failure_reason)
            post_update(cburl, cbdata, task, delay=http_retry_delay())

        for task in assignment["tasks"]:
            url = "%s/jobs/%s/tasks/%s" % (
                config["master_api"], assignment["job"]["id"], task["id"])
            data = {
                "state": WorkState.FAILED,
                "last_error": failure_traceback}
            post_update(url, data, task)

        # If the loading was partially successful for some reason, there
        # might already be an entry for this jobtype in the config.
        # Remove it if it exists.
        if "jobtype" in assignment:
            jobtype_id = assignment["jobtype"].pop("id", None)
            if jobtype_id:
                config["jobtypes"].pop(jobtype_id, None)

    def loaded_jobtype(jobtype_class, assign_id):
        # TODO: report error to master
        if hasattr(jobtype_class, "getTraceback"):
            logger.error(jobtype_class.getTraceback())
            return

        # TODO: add call to prepare_for_job
        # TODO: add call to spawn_persistent_process

        # Instance the job type and pass in the assignment data.
        instance = jobtype_class(request_data)

        if not isinstance(instance, JobType):
            raise TypeError(
                "Expected a subclass of "
                "pyfarm.jobtypes.core.jobtype.JobType")

        # TODO: add callback to cleanup_after_job
        # TODO: add callback to stop persistent process
        try:
            started_deferred, stopped_deferred = instance._start()
            started_deferred.addCallback(assignment_started, assign_id)
            started_deferred.addErrback(assignment_failed, assign_id)
            stopped_deferred.addCallback(assignment_stopped, assign_id)
            stopped_deferred.addErrback(assignment_failed, assign_id)
            stopped_deferred.addBoth(restart_if_necessary)
            stopped_deferred.addBoth(
                lambda *args: instance._remove_tempdirs())
            stopped_deferred.addBoth(
                lambda *args: instance._close_logs())
            stopped_deferred.addBoth(
                lambda *args: instance._upload_logfile())
        except Exception as e:
            logger.error("Error on starting jobtype, stopping it now. "
                         "Error was: %r. Traceback: %s", e,
                         traceback.format_exc())
            instance.stop(
                assignment_failed=True,
                error="Error while loading jobtype: %r. "
                      "Traceback: %s" % (e, traceback.format_exc()))
            assignment = config["current_assignments"].pop(assign_id)
            if "jobtype" in assignment:
                jobtype_id = assignment["jobtype"].pop("id", None)
                if jobtype_id:
                    config["jobtypes"].pop(jobtype_id, None)

    # Load the job type then pass the class along to the
    # callback.  No errback here because all the errors
    # are handled internally in this case.
    jobtype_loader = JobType.load(request_data)
    jobtype_loader.addCallback(loaded_jobtype, assignment_uuid)
    jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid)

    return dumps({"id": assignment_uuid}), ACCEPTED