Example #1
0
    def _handle_end(self, hostname, action, msg):  # pylint: disable=unused-argument
        try:
            job_id = int(msg[2])
            error_msg = msg[3]
            compressed_description = msg[4]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return

        try:
            job = TestJob.objects.get(id=job_id)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
            return

        filename = os.path.join(job.output_dir, 'description.yaml')
        # If description.yaml already exists: a END was already received
        if os.path.exists(filename):
            self.logger.info("[%d] %s => END (duplicated), skipping", job_id,
                             hostname)
        else:
            if compressed_description:
                self.logger.info("[%d] %s => END", job_id, hostname)
            else:
                self.logger.info(
                    "[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)",
                    job_id, hostname)
                with transaction.atomic():
                    # TODO: find a way to lock actual_device
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)

                    job.go_state_finished(TestJob.HEALTH_INCOMPLETE)
                    if error_msg:
                        self.logger.error("[%d] Error: %s", job_id, error_msg)
                        job.failure_comment = error_msg
                    job.save()

            # Create description.yaml even if it's empty
            # Allows to know when END messages are duplicated
            try:
                # Create the directory if it was not already created
                mkdir(os.path.dirname(filename))
                # TODO: check that compressed_description is not ""
                description = lzma.decompress(compressed_description)
                with open(filename, 'w') as f_description:
                    f_description.write(description.decode("utf-8"))
                if description:
                    parse_job_description(job)
            except (OSError, lzma.LZMAError) as exc:
                self.logger.error("[%d] Unable to dump 'description.yaml'",
                                  job_id)
                self.logger.exception("[%d] %s", job_id, exc)

        # ACK the job and mark the dispatcher as alive
        send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)
Example #2
0
    def _handle_end(self, hostname, action, msg):  # pylint: disable=unused-argument
        try:
            job_id = int(msg[2])
            error_msg = msg[3]
            compressed_description = msg[4]
        except (IndexError, ValueError):
            self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
            return

        try:
            job = TestJob.objects.get(id=job_id)
        except TestJob.DoesNotExist:
            self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
            return

        filename = os.path.join(job.output_dir, 'description.yaml')
        # If description.yaml already exists: a END was already received
        if os.path.exists(filename):
            self.logger.info("[%d] %s => END (duplicated), skipping", job_id, hostname)
        else:
            if compressed_description:
                self.logger.info("[%d] %s => END", job_id, hostname)
            else:
                self.logger.info("[%d] %s => END (lava-run crashed, mark job as INCOMPLETE)",
                                 job_id, hostname)
                with transaction.atomic():
                    # TODO: find a way to lock actual_device
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)

                    job.go_state_finished(TestJob.HEALTH_INCOMPLETE)
                    if error_msg:
                        self.logger.error("[%d] Error: %s", job_id, error_msg)
                        job.failure_comment = error_msg
                    job.save()

            # Create description.yaml even if it's empty
            # Allows to know when END messages are duplicated
            try:
                # Create the directory if it was not already created
                mkdir(os.path.dirname(filename))
                # TODO: check that compressed_description is not ""
                description = lzma.decompress(compressed_description)
                with open(filename, 'w') as f_description:
                    f_description.write(description.decode("utf-8"))
                if description:
                    parse_job_description(job)
            except (IOError, lzma.LZMAError) as exc:
                self.logger.error("[%d] Unable to dump 'description.yaml'",
                                  job_id)
                self.logger.exception("[%d] %s", job_id, exc)

        # ACK the job and mark the dispatcher as alive
        send_multipart_u(self.controler, [hostname, 'END_OK', str(job_id)])
        self.dispatcher_alive(hostname)
    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error(
                    "<%s> using protocol v%d while master is using v%d",
                    hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                             online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id,
                                 hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:

                # Save the description
                job = TestJob.objects.get(id=job_id)
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

                # Update status.
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True
Example #4
0
    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error("<%s> using protocol v%d while master is using v%d",
                                  hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

                # Save the description
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True