def controler_socket(self):
    """Receive one multipart message on the controler socket and handle it.

    Frame 0 is the sender's hostname and frame 1 the action; further frames
    are the action's arguments. HELLO, HELLO_RETRY, PING, END and START_OK
    are handled, any other action is only logged as an error.

    Returns:
        False when an argument is missing or is not an integer, or when the
        protocol version sent with HELLO/HELLO_RETRY differs from
        PROTOCOL_VERSION; True otherwise (unknown actions and unknown job
        ids included).
    """
    frames = self.controler.recv_multipart()
    # Dumping every received message is far too verbose for production;
    # developers can log `frames` at debug level here when needed.

    # Frame 0 is the hostname (see ZMQ documentation), frame 1 the action.
    hostname = frames[0]
    action = frames[1]

    if action in ('HELLO', 'HELLO_RETRY'):
        self.logger.info("%s => %s", hostname, action)

        # Refuse slaves that do not speak the master's protocol version.
        try:
            peer_version = int(frames[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, frames)
            return False
        if peer_version != PROTOCOL_VERSION:
            self.logger.error(
                "<%s> using protocol v%d while master is using v%d",
                hostname, peer_version, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])

        is_hello = action == 'HELLO'
        if hostname not in self.dispatchers:
            # Unknown dispatcher: HELLO and HELLO_RETRY both register it.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                         online=True)
        elif is_hello:
            # A known dispatcher sending HELLO again means it restarted.
            self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
        else:
            # HELLO_RETRY from a known dispatcher: assume the earlier HELLO
            # was received and the action succeeded.
            self.logger.warning("Dispatcher <%s> was not confirmed",
                                hostname)

        if is_hello:
            # FIXME: slaves should be able to restart cleanly without
            # affecting jobs; unexpected crashes need handling as well.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()
        return True

    if action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Answer the keep-alive before refreshing the dispatcher.
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)
        return True

    if action == 'END':
        try:
            job_id, job_status = int(frames[2]), int(frames[3])
            error_msg, description = frames[4], frames[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, frames)
            return False

        # A non-zero job status marks the job as incomplete.
        if not job_status:
            final_status = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)
        else:
            final_status = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d",
                             job_id, hostname, job_status)
            self.logger.error("[%d] Error: %s", job_id, error_msg)

        try:
            # Update the job status while holding the row lock.
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=final_status)

            # Dump the decompressed description next to the job output;
            # a failed dump is logged and does not stop the parsing below.
            description_path = os.path.join(job.output_dir,
                                            'description.yaml')
            try:
                with open(description_path, 'w') as handle:
                    handle.write(lzma.decompress(description))
            except (IOError, lzma.error) as err:
                self.logger.error("[%d] Unable to dump 'description.yaml'",
                                  job_id)
                self.logger.exception(err)
            parse_job_description(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        # ACK even when the job is unknown so the dispatcher can forget it.
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)
        return True

    if action == 'START_OK':
        try:
            job_id = int(frames[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, frames)
            return False

        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                locked_jobs = TestJob.objects.select_for_update()
                job = locked_jobs.get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        self.dispatcher_alive(hostname)
        return True

    self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                      hostname, action, frames[1:])
    return True
def controler_socket(self):
    """Process a single multipart message read from the controler socket.

    The message starts with the sender's hostname followed by the action
    name; the remaining frames carry the action's arguments. The actions
    HELLO, HELLO_RETRY, PING, END and START_OK are handled; everything else
    is reported through the logger.

    Returns:
        False if a required argument is missing or cannot be converted to
        an integer, or if the version given with HELLO/HELLO_RETRY is not
        PROTOCOL_VERSION. True in all other cases, including unknown
        actions and unknown job ids.
    """
    parts = self.controler.recv_multipart()
    # Logging each received message is much too verbose for production;
    # developers may log `parts` at debug level here while debugging.

    # First frame: the hostname (see ZMQ documentation); second: the action.
    hostname, action = parts[0], parts[1]

    if action in ('HELLO', 'HELLO_RETRY'):
        self.logger.info("%s => %s", hostname, action)

        # The slave must use the same protocol version as the master.
        try:
            announced = int(parts[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, parts)
            return False
        if announced != PROTOCOL_VERSION:
            self.logger.error(
                "<%s> using protocol v%d while master is using v%d",
                hostname, announced, PROTOCOL_VERSION)
            return False

        self.controler.send_multipart([hostname, 'HELLO_OK'])

        already_known = hostname in self.dispatchers
        plain_hello = action == 'HELLO'
        if already_known and plain_hello:
            # HELLO from a known dispatcher: the slave has restarted.
            self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
        elif already_known:
            # HELLO_RETRY from a known dispatcher: assume the HELLO command
            # was received and the action succeeded.
            self.logger.warning("Dispatcher <%s> was not confirmed",
                                hostname)
        else:
            # Not registered yet: HELLO and HELLO_RETRY are treated alike.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                         online=True)

        if plain_hello:
            # FIXME: a slave should be able to restart cleanly without
            # affecting its jobs, and unexpected crashes need handling too.
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()
        return True

    if action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Reply first, then refresh the dispatcher.
        self.controler.send_multipart([hostname, 'PONG'])
        self.dispatcher_alive(hostname)
        return True

    if action == 'END':
        try:
            job_id = int(parts[2])
            exit_code = int(parts[3])
            error_msg, packed_description = parts[4], parts[5]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, parts)
            return False

        # Any non-zero exit code means the job ended incomplete.
        if not exit_code:
            outcome = TestJob.COMPLETE
            self.logger.info("[%d] %s => END", job_id, hostname)
        else:
            outcome = TestJob.INCOMPLETE
            self.logger.info("[%d] %s => END with error %d",
                             job_id, hostname, exit_code)
            self.logger.error("[%d] Error: %s", job_id, error_msg)

        try:
            # Write out the description before touching the job status; a
            # failed dump is logged and the description is parsed anyway.
            job = TestJob.objects.get(id=job_id)
            target = os.path.join(job.output_dir, 'description.yaml')
            try:
                with open(target, 'w') as out:
                    out.write(lzma.decompress(packed_description))
            except (IOError, lzma.error) as err:
                self.logger.error("[%d] Unable to dump 'description.yaml'",
                                  job_id)
                self.logger.exception(err)
            parse_job_description(job)

            # Fetch the job again under a row lock for the status update.
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                fail_job(job, fail_msg=error_msg, job_status=outcome)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        # Always ACK, even for an unknown job, so the dispatcher forgets it.
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)
        return True

    if action == 'START_OK':
        try:
            job_id = int(parts[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, parts)
            return False

        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                candidates = TestJob.objects.select_for_update()
                job = candidates.get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        self.dispatcher_alive(hostname)
        return True

    self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                      hostname, action, parts[1:])
    return True
def controler_socket(self):
    """Handle one multipart message received on the controler socket.

    The first frame is the sender's hostname and the second the action
    name; the remaining frames are the action's arguments. HELLO,
    HELLO_RETRY, PING, ERROR, END and START_OK are handled; any other
    action is logged as an error.

    PING, ERROR, END and START_OK may come from a dispatcher this process
    does not know about. In that case the dispatcher is registered and a
    STATUS message is sent to it before it is marked alive. ERROR
    previously skipped this step and raised KeyError for an unknown
    dispatcher.

    Returns:
        False when a required argument is missing or is not an integer,
        True otherwise (unknown actions and unknown job ids included).
    """
    msg = self.controler.recv_multipart()
    self.logger.debug("[CC] Receiving: %s", msg)

    # 1: the hostname (see ZMQ documentation)
    hostname = msg[0]
    # 2: the action
    action = msg[1]

    def mark_alive():
        """Register the sender if it is unknown, then mark it alive."""
        if hostname not in self.dispatchers:
            # The server crashed: send a STATUS message
            self.logger.warning("Unknown dispatcher <%s> (server crashed)",
                                hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                         online=True)
            send_status(hostname, self.controler, self.logger)
        self.dispatchers[hostname].alive()

    # Handle the actions
    if action == 'HELLO':
        self.logger.info("%s => %s", hostname, action)
        self.controler.send_multipart([hostname, 'HELLO_OK'])
        # If the dispatcher is known and sent an HELLO, means that
        # the slave has restarted
        if hostname in self.dispatchers:
            self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
        else:
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                         online=True)
        # FIXME: slaves need to be allowed to restart cleanly without
        # affecting jobs as well as handling unexpected crashes.
        self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == "HELLO_RETRY":
        self.logger.info("%s => HELLO_RETRY", hostname)
        self.controler.send_multipart([hostname, "HELLO_OK"])

        if hostname in self.dispatchers:
            # Assume the HELLO command was received, and the
            # action succeeded.
            self.logger.warning("Dispatcher <%s> was not confirmed",
                                hostname)
        else:
            # No dispatcher, treat it as a normal HELLO message.
            self.logger.warning("New dispatcher <%s>", hostname)
            self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                         online=True)
            self._cancel_slave_dispatcher_jobs(hostname)

        # Mark the dispatcher as alive
        self.dispatchers[hostname].alive()

    elif action == 'PING':
        self.logger.debug("%s => PING", hostname)
        # Send back a signal
        self.controler.send_multipart([hostname, 'PONG'])
        mark_alive()

    elif action == "ERROR":
        try:
            job_id = int(msg[2])
            error_msg = str(msg[3])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, msg[:50])
            return False
        self.logger.error("[%d] Error: %s", job_id, error_msg)
        # Same recovery as the other actions: an ERROR can arrive from a
        # dispatcher that is not in self.dispatchers.
        mark_alive()

    elif action == 'END':
        try:
            job_id = int(msg[2])
            job_status = int(msg[3])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, msg)
            return False

        # A non-zero job status marks the job as incomplete.
        if job_status:
            self.logger.info("[%d] %s => END with error %d",
                             job_id, hostname, job_status)
            status = TestJob.INCOMPLETE
        else:
            self.logger.info("[%d] %s => END", job_id, hostname)
            status = TestJob.COMPLETE

        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                if job.status == TestJob.CANCELING:
                    cancel_job(job)
                else:
                    end_job(job, job_status=status)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)

        # ACK even if the job is unknown to let the dispatcher
        # forget about it
        self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
        mark_alive()

    elif action == 'START_OK':
        try:
            job_id = int(msg[2])
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'",
                              hostname, msg)
            return False

        self.logger.info("[%d] %s => START_OK", job_id, hostname)
        try:
            with transaction.atomic():
                job = TestJob.objects.select_for_update().get(id=job_id)
                start_job(job)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
        mark_alive()

    else:
        self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                          hostname, action, msg[1:])

    return True