def test_capure_stderr(self): config["jobtype_capture_process_output"] = True jobtype = JobType(fake_assignment()) with patch.object(process_stderr, "info") as mocked: jobtype.log_stderr_line(Mock(id=2), "stderr") mocked.assert_called_once_with("task %r: %s", 2, "stderr")
def test_none(self): jobtype = JobType(fake_assignment()) with patch.object(logger, "error") as mocked_error: self.assertIsNone(jobtype.format_error(None)) mocked_error.assert_called_with( "No error was defined for this failure.")
def test_process_stderr_line_calls_stdout_line_processing(self): jobtype = JobType(fake_assignment()) logpool.open_log(jobtype.uuid, self.create_file(), ignore_existing=True) protocol = Mock(id=4) with patch.object(jobtype, "process_stdout_line") as process_mock: jobtype.process_stderr_line(protocol, "stderr 4") process_mock.assert_called_with(protocol, "stderr 4")
def test_no_capure_stderr(self): config["jobtype_capture_process_output"] = False protocol = Mock(id=3, pid=33) jobtype = JobType(fake_assignment()) logpool.open_log(jobtype.uuid, self.create_file(), ignore_existing=True) with patch.object(logpool, "log") as mocked: jobtype.log_stderr_line(protocol, "stderr") mocked.assert_called_once_with(jobtype.uuid, STDERR, "stderr", 33)
def test_process_done(self): error = Failure(exc_value=Exception()) error.type = ProcessDone error.value = Mock(exitCode=1, message="foobar") jobtype = JobType(fake_assignment()) self.assertEqual( jobtype.format_error(error), "Process has finished with no apparent errors." )
def test_preprocess_can_stop_handling(self): jobtype = JobType(fake_assignment()) protocol = Mock(id=1) with nested( patch.object(jobtype, "preprocess_stderr_line", return_value=False), patch.object(jobtype, "format_stderr_line"), ) as (_, mocked_format): jobtype.handle_stderr_line(protocol, "stderr 1") self.assertEqual(mocked_format.call_count, 0)
def test_preprocess_replaces_output(self): jobtype = JobType(fake_assignment()) logpool.open_log(jobtype.uuid, self.create_file(), ignore_existing=True) protocol = Mock(id=2) with nested( patch.object(jobtype, "preprocess_stderr_line", return_value="foo"), patch.object(jobtype, "format_stderr_line"), ) as (_, mocked): jobtype.handle_stderr_line(protocol, "stderr 2") mocked.assert_called_with(protocol, "foo")
def test_process_terminated(self): error = Failure(exc_value=Exception()) error.type = ProcessTerminated error.value = Mock(exitCode=1, message="foobar") jobtype = JobType(fake_assignment()) self.assertEqual( jobtype.format_error(error), "Process may have terminated abnormally, please check " "the logs. Message from error " "was 'foobar'" )
def test_format_replaces_output(self): jobtype = JobType(fake_assignment()) logpool.open_log(jobtype.uuid, self.create_file(), ignore_existing=True) protocol = Mock(id=3) with nested( patch.object(jobtype, "format_stdout_line", return_value="bar"), patch.object(jobtype, "log_stdout_line"), patch.object(jobtype, "process_stdout_line"), ) as (_, log_mock, process_mock): jobtype.handle_stdout_line(protocol, "stdout 3") log_mock.assert_called_with(protocol, "bar") process_mock.assert_called_with(protocol, "bar")
def test_other_conversion_problem(self): jobtype = JobType(fake_assignment()) class OtherType(object): def __str__(self): raise NotImplementedError("__str__ not implemented") other = OtherType() try: str(other) except NotImplementedError as str_error: pass with patch.object(logger, "error") as mocked_error: self.assertIsNone(jobtype.format_error(other)) mocked_error.assert_called_with( "Don't know how to format %r as a string. Error while calling " "str(%r) was %s.", other, str(str_error))
def test_other(self): jobtype = JobType(fake_assignment()) self.assertEqual(jobtype.format_error(42), str(42))
def test_string(self): jobtype = JobType(fake_assignment()) self.assertEqual( jobtype.format_error("This is a string"), "This is a string" )
def test_exception(self): jobtype = JobType(fake_assignment()) self.assertEqual( jobtype.format_error(TypeError("foobar 2")), "foobar 2" )
def test_schema(self): with self.assertRaises(MultipleInvalid): JobType.load({})
def post(self, **kwargs): if request_from_master(kwargs["request"]): config.master_contacted() request = kwargs["request"] request_data = kwargs["data"] # First, get the resources we have *right now*. In some cases # this means using the functions in pyfarm.core.sysinfo because # entries in `config` could be slightly out of sync with the system. memory_free = free_ram() cpus = config["agent_cpus"] requires_ram = request_data["job"].get("ram") requires_cpus = request_data["job"].get("cpus") if ("agent_id" in request_data and request_data["agent_id"] != config["agent_id"]): logger.error("Wrong agent_id in assignment: %s. Our id is %s", request_data["agent_id"], config["agent_id"]) return ( dumps({"error": "You have the wrong agent. " "I am %s." % config["agent_id"], "agent_id": config["agent_id"]}), BAD_REQUEST ) elif self.agent.reannounce_lock.locked: logger.warning("Temporarily rejecting assignment because we " "are in the middle of a reannounce.") return ( dumps({"error": "Agent cannot accept assignments because of a " "reannounce in progress. Try again shortly."}), SERVICE_UNAVAILABLE ) elif self.agent.shutting_down: logger.error("Rejecting assignment because the agent is in the " "process of shutting down.") return ( dumps({"error": "Agent cannot accept assignments because it is " "shutting down."}), SERVICE_UNAVAILABLE ) elif "restart_requested" in config \ and config["restart_requested"] is True: logger.error("Rejecting assignment because of scheduled restart.") return ( dumps({"error": "Agent cannot accept assignments because of a " "pending restart."}), SERVICE_UNAVAILABLE ) elif "agent_id" not in config: logger.error( "Agent has not yet connected to the master or `agent_id` " "has not been set yet.") return ( dumps({"error": "agent_id has not been set in the config"}), SERVICE_UNAVAILABLE ) # Do we have enough ram? elif requires_ram is not None and requires_ram > memory_free: logger.error( "Task %s requires %sMB of ram, this agent has %sMB free. " "Rejecting Task %s.", request_data["job"]["id"], requires_ram, memory_free, request_data["job"]["id"]) config["free_ram"] = memory_free return ( dumps({"error": "Not enough ram", "agent_ram": memory_free, "requires_ram": requires_ram}), BAD_REQUEST ) # Do we have enough cpus (count wise)? elif requires_cpus is not None and requires_cpus > cpus: logger.error( "Task %s requires %s CPUs, this agent has %s CPUs. " "Rejecting Task %s.", request_data["job"]["id"], requires_cpus, cpus, request_data["job"]["id"]) return ( dumps({"error": "Not enough cpus", "agent_cpus": cpus, "requires_cpus": requires_cpus}), BAD_REQUEST ) new_task_ids = set(task["id"] for task in request_data["tasks"]) for assignment in config["current_assignments"].itervalues(): existing_task_ids = set(x["id"] for x in assignment["tasks"]) # If the assignment is identical to one we already have if existing_task_ids == new_task_ids: logger.debug( "Ignoring repeated assignment of the same batch") return dumps({"id": assignment["id"]}), ACCEPTED # If there is only a partial overlap elif existing_task_ids & new_task_ids: logger.error("Rejecting assignment with partial overlap with " "existing assignment.") unknown_task_ids = new_task_ids - existing_task_ids return ( dumps({"error": "Partial overlap of tasks", "rejected_task_ids": list(unknown_task_ids)}), CONFLICT ) if not config["agent_allow_sharing"]: for jobtype in config["jobtypes"].itervalues(): num_finished_tasks = (len(jobtype.finished_tasks) + len(jobtype.failed_tasks)) if len(jobtype.assignment["tasks"]) > num_finished_tasks: logger.error("Rejecting an assignment that would require " "agent sharing") return ( dumps({ "error": "Agent does not allow multiple " "assignments", "rejected_task_ids": list(new_task_ids)}), CONFLICT ) assignment_uuid = uuid4() request_data.update(id=assignment_uuid) config["current_assignments"][assignment_uuid] = request_data logger.debug("Accepted assignment %s: %r", assignment_uuid, request_data) logger.info("Accept assignment from job %s with %s tasks", request_data["job"]["title"], len(request_data["tasks"])) def assignment_failed(result, assign_id): logger.error( "Assignment %s failed, result: %r, removing.", assign_id, result) logger.error(result.getTraceback()) if (len(config["current_assignments"]) <= 1 and not self.agent.shutting_down): config["state"] = AgentState.ONLINE self.agent.reannounce(force=True) # Do not mark the assignment as failed if the reason for failing # was that we ran out of disk space failed = not isinstance(result.value, InsufficientSpaceError) assignment = config["current_assignments"].pop(assign_id) if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: instance = config["jobtypes"].pop(jobtype_id, None) instance.stop( assignment_failed=failed, avoid_reassignment=not failed, error="Error in jobtype: %r. " "Traceback: %s" % (result, traceback.format_exc())) def assignment_started(_, assign_id): logger.debug("Assignment %s has started", assign_id) config["state"] = AgentState.RUNNING self.agent.reannounce(force=True) def remove_assignment(_, assign_id): assignment = config["current_assignments"].pop(assign_id) if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: config["jobtypes"].pop(jobtype_id, None) def assignment_stopped(_, assign_id): logger.debug("Assignment %s has stopped", assign_id) if (len(config["current_assignments"]) <= 1 and not self.agent.shutting_down): config["state"] = AgentState.ONLINE self.agent.reannounce(force=True) assignment = config["current_assignments"][assign_id] if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: jobtype = config["jobtypes"].pop(jobtype_id, None) updates_deferred = DeferredList( jobtype.task_update_deferreds) updates_deferred.addBoth(remove_assignment, assign_id) else: config["current_assignments"].pop(assign_id) def restart_if_necessary(_): # pragma: no cover if "restart_requested" in config and config["restart_requested"]: stopping = config["agent"].stop() stopping.addCallbacks(lambda _: reactor.stop(), lambda _: reactor.stop()) def load_jobtype_failed(result, assign_id): logger.error( "Loading jobtype for assignment %s failed, removing.", assign_id) traceback = result.getTraceback() logger.debug("Got traceback") logger.error(traceback) assignment = config["current_assignments"].pop(assign_id) # Mark all tasks as failed on master and set an error message logger.debug("Marking tasks in assignment as failed") def post_update(post_url, post_data, task, delay=0): post_func = partial(post, post_url, data=post_data, callback=lambda x: result_callback( post_url, post_data, task, x), errback=lambda x: error_callback( post_url, post_data, task, x)) reactor.callLater(delay, post_func) def result_callback(cburl, cbdata, task, response): if 500 <= response.code < 600: logger.error( "Error while marking task %s as failed on master, " "retrying", task["id"]) post_update(cburl, cbdata, task, delay=http_retry_delay()) elif response.code != OK: logger.error( "Could not mark task %s as failed, server response " "code was %s", task["id"], response.code) else: logger.info( "Marked task %s as failed on master", task["id"]) def error_callback(cburl, cbdata, task, failure_reason): logger.error( "Error while marking task %s as failed, retrying", task["id"], failure_reason) post_update(cburl, cbdata, task, delay=http_retry_delay()) for task in assignment["tasks"]: url = "%s/jobs/%s/tasks/%s" % ( config["master_api"], assignment["job"]["id"], task["id"]) data = { "state": WorkState.FAILED, "last_error": traceback} post_update(url, data, task) # If the loading was partially successful for some reason, there # might already be an entry for this jobtype in the config. # Remove it if it exists. if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: config["jobtypes"].pop(jobtype_id, None) def loaded_jobtype(jobtype_class, assign_id): # TODO: report error to master if hasattr(jobtype_class, "getTraceback"): logger.error(jobtype_class.getTraceback()) return # TODO: add call to prepare_for_job # TODO: add call to spawn_persistent_process # Instance the job type and pass in the assignment data. instance = jobtype_class(request_data) if not isinstance(instance, JobType): raise TypeError( "Expected a subclass of " "pyfarm.jobtypes.core.jobtype.JobType") # TODO: add callback to cleanup_after_job # TODO: add callback to stop persistent process try: started_deferred, stopped_deferred = instance._start() started_deferred.addCallback(assignment_started, assign_id) started_deferred.addErrback(assignment_failed, assign_id) stopped_deferred.addCallback(assignment_stopped, assign_id) stopped_deferred.addErrback(assignment_failed, assign_id) stopped_deferred.addBoth(restart_if_necessary) stopped_deferred.addBoth( lambda *args: instance._remove_tempdirs()) stopped_deferred.addBoth( lambda *args: instance._close_logs()) stopped_deferred.addBoth( lambda *args: instance._upload_logfile()) except Exception as e: logger.error("Error on starting jobtype, stopping it now. " "Error was: %r. Traceback: %s", e, traceback.format_exc()) instance.stop(assignment_failed=True, error="Error while loading jobtype: %r. " "Traceback: %s" % (e, traceback.format_exc())) assignment = config["current_assignments"].pop(assign_id) if "jobtype" in assignment: jobtype_id = assignment["jobtype"].pop("id", None) if jobtype_id: config["jobtypes"].pop(jobtype_id, None) # Load the job type then pass the class along to the # callback. No errback here because all the errors # are handled internally in this case. jobtype_loader = JobType.load(request_data) jobtype_loader.addCallback(loaded_jobtype, assignment_uuid) jobtype_loader.addErrback(load_jobtype_failed, assignment_uuid) return dumps({"id": assignment_uuid}), ACCEPTED