Ejemplo n.º 1
0
    def _cancel_slave_dispatcher_jobs(self, hostname):
        """Get dispatcher jobs and cancel them.

        :param hostname: The name of the dispatcher host.
        :type hostname: string
        """
        # TODO: DB: mark the dispatcher as online in the database.
        # For the moment this should not be done by this process as
        # some dispatchers are using old and new dispatcher.

        # Mark all jobs on this dispatcher as canceled.
        # The dispatcher had (re)started, so all jobs have to be
        # finished.
        with transaction.atomic():
            jobs = TestJob.objects.filter(
                actual_device__worker_host__hostname=hostname,
                is_pipeline=True,
                status=TestJob.RUNNING).select_for_update()

            for job in jobs:
                self.logger.info("[%d] Canceling", job.id)
                cancel_job(job)
Ejemplo n.º 2
0
    def _cancel_slave_dispatcher_jobs(self, hostname):
        """Get dispatcher jobs and cancel them.

        :param hostname: The name of the dispatcher host.
        :type hostname: string
        """
        # TODO: DB: mark the dispatcher as online in the database.
        # For the moment this should not be done by this process as
        # some dispatchers are using old and new dispatcher.

        # Mark all jobs on this dispatcher as canceled.
        # The dispatcher had (re)started, so all jobs have to be
        # finished.
        with transaction.atomic():
            jobs = TestJob.objects.filter(
                actual_device__worker_host__hostname=hostname,
                is_pipeline=True,
                status=TestJob.RUNNING).select_for_update()

            for job in jobs:
                self.logger.info("[%d] Canceling", job.id)
                cancel_job(job)
Ejemplo n.º 3
0
    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error(
                    "<%s> using protocol v%d while master is using v%d",
                    hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                             online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id,
                                 hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:

                # Save the description
                job = TestJob.objects.get(id=job_id)
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

                # Update status.
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True
Ejemplo n.º 4
0
    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error("<%s> using protocol v%d while master is using v%d",
                                  hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

                # Save the description
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True
Ejemplo n.º 5
0
    def controler_socket(self):
        msg = self.controler.recv_multipart()
        self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO':
            self.logger.info("%s => %s", hostname, action)
            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                self.logger.warning("Dispatcher <%s> has RESTARTED", hostname)
            else:
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)
            # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
            # as well as handling unexpected crashes.
            self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == "HELLO_RETRY":
            self.logger.info("%s => HELLO_RETRY", hostname)
            self.controler.send_multipart([hostname, "HELLO_OK"])

            if hostname in self.dispatchers:
                # Assume the HELLO command was received, and the
                # action succeeded.
                self.logger.warning(
                    "Dispatcher <%s> was not confirmed", hostname)
            else:
                # No dispatcher, treat it as a normal HELLO message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(
                    hostname, online=True)

                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])

            if hostname not in self.dispatchers:
                # The server crashed: send a STATUS message
                self.logger.warning("Unknown dispatcher <%s> (server crashed)", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)
                send_status(hostname, self.controler, self.logger)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == "ERROR":
            try:
                job_id = int(msg[2])
                error_msg = str(msg[3])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg[:50])
                return False
            self.logger.error("[%d] Error: %s", job_id, error_msg)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'END':
            status = TestJob.COMPLETE
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if job_status:
                self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
                status = TestJob.INCOMPLETE
            else:
                self.logger.info("[%d] %s => END", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    else:
                        end_job(job, job_status=status)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])

            if hostname not in self.dispatchers:
                # The server crashed: send a STATUS message
                self.logger.warning("Unknown dispatcher <%s> (server crashed)", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)
                send_status(hostname, self.controler, self.logger)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            if hostname not in self.dispatchers:
                # The server crashed: send a STATUS message
                self.logger.warning("Unknown dispatcher <%s> (server crashed)", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)
                send_status(hostname, self.controler, self.logger)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True