def _handle_end(self, hostname, action, msg):  # pylint: disable=unused-argument
    """Handle an END message sent by a dispatcher.

    ``msg[2]`` carries the job id, ``msg[3]`` an optional error message and
    ``msg[4]`` the lzma-compressed job description. An empty description is
    logged as a lava-run crash and the job is finished with
    ``TestJob.HEALTH_INCOMPLETE``.

    The decompressed description is written to ``description.yaml`` in the
    job output directory. The presence of that file is what marks an END as
    already processed, so it is written even when the description is empty.

    Args:
        hostname: identity of the dispatcher, used to address the reply.
        action: unused here.
        msg: the frames of the received message.

    Returns:
        None. ``END_OK`` is sent back unless the message is malformed.
    """
    try:
        job_id = int(msg[2])
        error_msg = msg[3]
        compressed_description = msg[4]
    except (IndexError, ValueError):
        self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
        return

    try:
        job = TestJob.objects.get(id=job_id)
    except TestJob.DoesNotExist:
        self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
        return

    filename = os.path.join(job.output_dir, 'description.yaml')
    # If description.yaml already exists: an END was already received
    if os.path.exists(filename):
        self.logger.info("[%d] %s => END (duplicated), skipping", job_id, hostname)
    else:
        if compressed_description:
            self.logger.info("[%d] %s => END", job_id, hostname)
        else:
            self.logger.info(
                "[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)",
                job_id, hostname)
            with transaction.atomic():
                # TODO: find a way to lock actual_device
                job = TestJob.objects.select_for_update().get(id=job_id)

                job.go_state_finished(TestJob.HEALTH_INCOMPLETE)
                if error_msg:
                    self.logger.error("[%d] Error: %s", job_id, error_msg)
                    job.failure_comment = error_msg
                job.save()

        # Create description.yaml even if it's empty
        # Allows to know when END messages are duplicated
        try:
            # Create the directory if it was not already created
            mkdir(os.path.dirname(filename))
            # An empty payload is not a valid lzma stream: decompressing it
            # raises LZMAError, and the marker file would never be written.
            if compressed_description:
                description = lzma.decompress(compressed_description)
            else:
                description = b""
            with open(filename, 'w') as f_description:
                f_description.write(description.decode("utf-8"))
            if description:
                parse_job_description(job)
        except (OSError, UnicodeError, lzma.LZMAError) as exc:
            # UnicodeError: a payload that is not valid UTF-8 must not abort
            # the handler before the dispatcher gets its ACK.
            self.logger.error("[%d] Unable to dump 'description.yaml'", job_id)
            self.logger.exception("[%d] %s", job_id, exc)

    # ACK the job and mark the dispatcher as alive
    send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
    self.dispatcher_alive(hostname)
def _handle_end(self, hostname, action, msg):  # pylint: disable=unused-argument
    """Process the END message a dispatcher sends when a job is over.

    The job id is read from ``msg[2]``, the error message from ``msg[3]``
    and the lzma-compressed job description from ``msg[4]``. When the
    description is empty the job is logged as a lava-run crash and moved to
    the finished state with ``TestJob.HEALTH_INCOMPLETE``.

    ``description.yaml`` in the job output directory doubles as the
    "END already received" marker: if it exists the message is treated as a
    duplicate, otherwise it is written (possibly empty) from the
    decompressed description.

    Args:
        hostname: identity of the dispatcher, used to address the reply.
        action: unused here.
        msg: the frames of the received message.

    Returns:
        None. ``END_OK`` is sent back unless the message is malformed.
    """
    try:
        job_id = int(msg[2])
        error_msg = msg[3]
        compressed_description = msg[4]
    except (IndexError, ValueError):
        self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
        return

    try:
        job = TestJob.objects.get(id=job_id)
    except TestJob.DoesNotExist:
        self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
        return

    filename = os.path.join(job.output_dir, 'description.yaml')
    # If description.yaml already exists: an END was already received
    if os.path.exists(filename):
        self.logger.info("[%d] %s => END (duplicated), skipping", job_id, hostname)
    else:
        if compressed_description:
            self.logger.info("[%d] %s => END", job_id, hostname)
        else:
            self.logger.info("[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)",
                             job_id, hostname)
            with transaction.atomic():
                # TODO: find a way to lock actual_device
                job = TestJob.objects.select_for_update().get(id=job_id)

                job.go_state_finished(TestJob.HEALTH_INCOMPLETE)
                if error_msg:
                    self.logger.error("[%d] Error: %s", job_id, error_msg)
                    job.failure_comment = error_msg
                job.save()

        # Create description.yaml even if it's empty
        # Allows to know when END messages are duplicated
        try:
            # Create the directory if it was not already created
            mkdir(os.path.dirname(filename))
            # lzma.decompress() rejects an empty stream with LZMAError, which
            # would skip the file creation below; map "no payload" to an
            # empty description instead.
            if compressed_description:
                description = lzma.decompress(compressed_description)
            else:
                description = b""
            with open(filename, 'w') as f_description:
                f_description.write(description.decode("utf-8"))
            if description:
                parse_job_description(job)
        except (IOError, UnicodeError, lzma.LZMAError) as exc:
            # UnicodeError: a description that is not valid UTF-8 is logged
            # like any other dump failure so that the ACK below is still sent.
            self.logger.error("[%d] Unable to dump 'description.yaml'", job_id)
            self.logger.exception("[%d] %s", job_id, exc)

    # ACK the job and mark the dispatcher as alive
    send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
    self.dispatcher_alive(hostname)
def controler_socket(self):
    """Receive one message from the controler socket and handle it.

    The first frame is the dispatcher hostname and the second one the
    action: ``HELLO``, ``HELLO_RETRY``, ``PING``, ``END`` or ``START_OK``.
    Any other action is only logged.

    Returns:
        False when the message is malformed or when the dispatcher uses
        another protocol version, True otherwise.
    """
    msg = self.controler.recv_multipart()
    # This is way too verbose for production and should only be activated
    # by (and for) developers
    # self.logger.debug("[CC] Receiving: %s", msg)

    try:
        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
    except IndexError:
        # A truncated message is rejected like any other malformed one
        # instead of raising out of the handler.
        self.logger.error("Invalid message (missing hostname or action) '%s'", msg)
        return False

    # Handle the actions
    if action in ('HELLO', 'HELLO_RETRY'):
        self.logger.info("%s => %s", hostname, action)

        # Check the protocol version
        try:
            slave_version = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if slave_version != PROTOCOL_VERSION:
            self.logger.error(
                "<%s> using protocol v%d while master is using v%d",
                hostname, slave_version, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])
        # If the dispatcher is known and sent an HELLO, means that
        # the slave has restarted
        if hostname in self.dispatchers:
            if action == 'HELLO':
                self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
            else:
                # Assume the HELLO command was received, and the
                # action succeeded.
                self.logger.warning("Dispatcher <%s> was not confirmed", hostname)
        else:
            # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
            # message.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

        if action == 'HELLO':
            # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
            # as well as handling unexpected crashes.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Send back a signal
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)

    elif action == 'END':
        try:
            job_id = int(msg[2])
            job_status = int(msg[3])
            error_msg = msg[4]
            description = msg[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False

        # A non-zero status reported by the dispatcher means the job failed
        if job_status:
            status = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
            self.logger.error("[%d] Error: %s", job_id, error_msg)
        else:
            status = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)

        # Find the corresponding job and update the status
        try:
            # Save the description
            job = TestJob.objects.get(id=job_id)
            filename = os.path.join(job.output_dir, 'description.yaml')
            try:
                with open(filename, 'w') as f_description:
                    f_description.write(lzma.decompress(description))
            except (IOError, lzma.error) as exc:
                self.logger.error("[%d] Unable to dump 'description.yaml'", job_id)
                self.logger.exception(exc)
            parse_job_description(job)

            # Update status.
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=status)

        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)

    elif action == 'START_OK':
        try:
            job_id = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        self.dispatcher_alive(hostname)

    else:
        self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                          hostname, action, msg[1:])
    return True
def controler_socket(self):
    """Read one multipart message from the controler socket and act on it.

    Frame 0 is the hostname of the dispatcher and frame 1 the action.
    ``HELLO``/``HELLO_RETRY``, ``PING``, ``END`` and ``START_OK`` are
    handled; an unknown action is logged and otherwise ignored.

    Returns:
        False when the message is malformed or when the dispatcher protocol
        version differs from ``PROTOCOL_VERSION``, True otherwise.
    """
    msg = self.controler.recv_multipart()
    # This is way too verbose for production and should only be activated
    # by (and for) developers
    # self.logger.debug("[CC] Receiving: %s", msg)

    try:
        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
    except IndexError:
        # Without both frames nothing can be dispatched: report the message
        # as invalid rather than letting IndexError escape.
        self.logger.error("Invalid message (missing hostname or action) '%s'", msg)
        return False

    # Handle the actions
    if action in ('HELLO', 'HELLO_RETRY'):
        self.logger.info("%s => %s", hostname, action)

        # Check the protocol version
        try:
            slave_version = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        if slave_version != PROTOCOL_VERSION:
            self.logger.error("<%s> using protocol v%d while master is using v%d",
                              hostname, slave_version, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])
        # If the dispatcher is known and sent an HELLO, means that
        # the slave has restarted
        if hostname in self.dispatchers:
            if action == 'HELLO':
                self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
            else:
                # Assume the HELLO command was received, and the
                # action succeeded.
                self.logger.warning("Dispatcher <%s> was not confirmed", hostname)
        else:
            # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
            # message.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

        if action == 'HELLO':
            # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
            # as well as handling unexpected crashes.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Send back a signal
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)

    elif action == 'END':
        try:
            job_id = int(msg[2])
            job_status = int(msg[3])
            error_msg = msg[4]
            description = msg[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False

        # A non-zero status reported by the dispatcher means the job failed
        if job_status:
            status = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
            self.logger.error("[%d] Error: %s", job_id, error_msg)
        else:
            status = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)

        # Find the corresponding job and update the status
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=status)

            # Save the description
            filename = os.path.join(job.output_dir, 'description.yaml')
            try:
                with open(filename, 'w') as f_description:
                    f_description.write(lzma.decompress(description))
            except (IOError, lzma.error) as exc:
                self.logger.error("[%d] Unable to dump 'description.yaml'", job_id)
                self.logger.exception(exc)
            parse_job_description(job)

        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)

    elif action == 'START_OK':
        try:
            job_id = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return False
        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        self.dispatcher_alive(hostname)

    else:
        self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                          hostname, action, msg[1:])
    return True