Esempi in Python per fail_job, esempi in Python per lava_scheduler_app.dbutils.fail_job

Esempio n. 1

0

Mostra file

File: dispatcher-master.py Progetto: dl9pf/lava-server

 def handle_canceling(self):
     for job in TestJob.objects.filter(status=TestJob.CANCELING, is_pipeline=True):
         worker_host = job.lookup_worker if job.dynamic_connection else job.actual_device.worker_host
         if not worker_host:
             self.logger.warning("[%d] Invalid worker information", job.id)
             # shouldn't happen
             fail_job(job, 'invalid worker information', TestJob.CANCELED)
             continue
         self.logger.info("[%d] CANCEL => %s", job.id,
                          worker_host.hostname)
         self.controler.send_multipart([str(worker_host.hostname),
                                        'CANCEL', str(job.id)])

Esempio n. 2

0

Mostra file

File: dispatcher-master.py Progetto: mripard/lava-server

 def handle_canceling(self):
     for job in TestJob.objects.filter(status=TestJob.CANCELING, is_pipeline=True):
         worker_host = job.lookup_worker if job.dynamic_connection else job.actual_device.worker_host
         if not worker_host:
             self.logger.warning("[%d] Invalid worker information", job.id)
             # shouldn't happen
             fail_job(job, 'invalid worker information', TestJob.CANCELED)
             continue
         self.logger.info("[%d] CANCEL => %s", job.id,
                          worker_host.hostname)
         self.controler.send_multipart([str(worker_host.hostname),
                                        'CANCEL', str(job.id)])

Esempio n. 3

0

Mostra file

File: dispatcher-master.py Progetto: kernelci/lava-server

    def process_jobs(self, options):
        for job in TestJob.objects.filter(
                Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
                .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
            device = None
            worker_host = None

            device = select_device(job, self.dispatchers)
            if not device:
                # e.g. one or more jobs in the MultiNode group do not yet have Reserved devices.
                continue
            # selecting device can change the job
            job.refresh_from_db()

            self.logger.info("[%d] Assigning %s device", job.id, device)
            if job.actual_device is None:
                # health checks
                device = job.requested_device
                if not device.worker_host:
                    msg = "Infrastructure error: Invalid worker information"
                    self.logger.error("[%d] %s", job.id, msg)
                    fail_job(job, msg, TestJob.INCOMPLETE)
                    continue
            # Launch the job
            create_job(job, device)
            self.logger.info("[%d] START => %s (%s)", job.id,
                             device.worker_host.hostname, device.hostname)
            worker_host = device.worker_host
            try:
                # Load job definition to get the variables for template
                # rendering
                job_def = yaml.load(job.definition)
                job_ctx = job_def.get('context', {})

                # Load env.yaml, env-dut.yaml and dispatcher configuration
                # All three are optional
                env_str = load_optional_yaml_file(options['env'])
                env_dut_str = load_optional_yaml_file(options['env_dut'])

                # Load device configuration
                if device:
                    device_configuration = device.load_configuration(job_ctx)
                    dispatcher_config_file = os.path.join(
                        options['dispatchers_config'],
                        "%s.yaml" % worker_host.hostname)
                    dispatcher_config = load_optional_yaml_file(
                        dispatcher_config_file)

                    self.controler.send_multipart([
                        str(worker_host.hostname), 'START',
                        str(job.id),
                        self.export_definition(job),
                        str(device_configuration), dispatcher_config, env_str,
                        env_dut_str
                    ])

                if job.is_multinode:
                    # All secondary connections must be made from a dispatcher local to the one host device
                    # to allow for local firewalls etc. So the secondary connection is started on the
                    # remote worker of the "nominated" host.
                    # This job will not be a dynamic_connection, this is the parent.
                    device = None
                    device_configuration = None
                    # to get this far, the rest of the multinode group must also be ready
                    # so start the dynamic connections
                    parent = job

                    for group_job in job.sub_jobs_list:
                        if group_job == parent or not group_job.dynamic_connection:
                            continue

                        worker_host = parent.actual_device.worker_host
                        dispatcher_config_file = os.path.join(
                            options['dispatchers_config'],
                            "%s.yaml" % worker_host.hostname)
                        dispatcher_config = load_optional_yaml_file(
                            dispatcher_config_file)

                        # inherit only enough configuration for dynamic_connection operation
                        device_configuration = parent.actual_device.load_configuration(
                            job_ctx)
                        self.logger.info(
                            "[%d] Trimming dynamic connection device configuration.",
                            group_job.id)
                        device_configuration = parent.actual_device.minimise_configuration(
                            device_configuration)

                        self.logger.info("[%d] START => %s (connection)",
                                         group_job.id, worker_host.hostname)
                        self.controler.send_multipart([
                            str(worker_host.hostname), 'START',
                            str(group_job.id),
                            self.export_definition(group_job),
                            str(device_configuration), dispatcher_config,
                            env_str, env_dut_str
                        ])
                continue

            except jinja2.TemplateNotFound as exc:
                self.logger.error("[%d] Template not found: '%s'", job.id,
                                  exc.message)
                msg = "Infrastructure error: Template not found: '%s'" % \
                      exc.message
            except jinja2.TemplateSyntaxError as exc:
                self.logger.error(
                    "[%d] Template syntax error in '%s', line %d: %s", job.id,
                    exc.name, exc.lineno, exc.message)
                msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                      (exc.name, exc.lineno, exc.message)
            except IOError as exc:
                self.logger.error("[%d] Unable to read '%s': %s", job.id,
                                  exc.filename, exc.strerror)
                msg = "Infrastructure error: cannot open '%s': %s" % \
                      (exc.filename, exc.strerror)
            except yaml.YAMLError as exc:
                self.logger.error("[%d] Unable to parse job definition: %s",
                                  job.id, exc)
                msg = "Infrastructure error: cannot parse job definition: %s" % \
                      exc

            self.logger.error("[%d] INCOMPLETE job", job.id)
            fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)

Esempio n. 4

0

Mostra file

File: dispatcher-master.py Progetto: kernelci/lava-server

    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error(
                    "<%s> using protocol v%d while master is using v%d",
                    hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname,
                                                             online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id,
                                 hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:

                # Save the description
                job = TestJob.objects.get(id=job_id)
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

                # Update status.
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname,
                                  msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True

Esempio n. 5

0

Mostra file

File: dispatcher-master.py Progetto: dl9pf/lava-server

    def process_jobs(self, options):
        for job in TestJob.objects.filter(
                Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
                .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
            if job.dynamic_connection:
                # A secondary connection must be made from a dispatcher local to the host device
                # to allow for local firewalls etc. So the secondary connection is started on the
                # remote worker of the "nominated" host.
                # FIXME:
                device = None
                worker_host = job.lookup_worker
                self.logger.info("[%d] START => %s (connection)", job.id,
                                 worker_host.hostname)
            else:
                device = select_device(job, self.dispatchers)
                if not device:
                    continue
                # selecting device can change the job
                job = TestJob.objects.get(id=job.id)
                self.logger.info("[%d] Assigning %s device", job.id, device)
                if job.actual_device is None:
                    device = job.requested_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        continue

                    # Launch the job
                    create_job(job, device)
                    self.logger.info("[%d] START => %s (%s)", job.id,
                                     device.worker_host.hostname, device.hostname)
                    worker_host = device.worker_host
                else:
                    device = job.actual_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        continue
                    self.logger.info("[%d] START => %s (%s) (retrying)", job.id,
                                     device.worker_host.hostname, device.hostname)
                    worker_host = device.worker_host
            try:
                # Load job definition to get the variables for template
                # rendering
                job_def = yaml.load(job.definition)
                job_ctx = job_def.get('context', {})

                # Load device configuration
                device_configuration = '' \
                    if job.dynamic_connection else device.load_device_configuration(job_ctx)

                # Load env.yaml, env-dut.yaml and dispatcher configuration
                # All three are optional
                env_str = load_optional_yaml_file(options['env'])
                env_dut_str = load_optional_yaml_file(options['env_dut'])
                dispatcher_config_file = os.path.join(options['dispatchers_config'],
                                                      "%s.yaml" % worker_host.hostname)
                dispatcher_config = load_optional_yaml_file(dispatcher_config_file)

                if job.is_multinode:
                    for group_job in job.sub_jobs_list:
                        if group_job.dynamic_connection:
                            # to get this far, the rest of the multinode group must also be ready
                            # so start the dynamic connections
                            # FIXME: rationalise and streamline
                            self.controler.send_multipart(
                                [str(worker_host.hostname),
                                 'START', str(group_job.id),
                                 self.export_definition(group_job),
                                 str(device_configuration),
                                 dispatcher_config,
                                 env_str, env_dut_str])

                self.controler.send_multipart(
                    [str(worker_host.hostname),
                     'START', str(job.id),
                     self.export_definition(job),
                     str(device_configuration),
                     dispatcher_config,
                     env_str, env_dut_str])
                continue

            except jinja2.TemplateNotFound as exc:
                self.logger.error("[%d] Template not found: '%s'",
                                  job.id, exc.message)
                msg = "Infrastructure error: Template not found: '%s'" % \
                      exc.message
            except jinja2.TemplateSyntaxError as exc:
                self.logger.error("[%d] Template syntax error in '%s', line %d: %s",
                                  job.id, exc.name, exc.lineno, exc.message)
                msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                      (exc.name, exc.lineno, exc.message)
            except IOError as exc:
                self.logger.error("[%d] Unable to read '%s': %s",
                                  job.id, exc.filename, exc.strerror)
                msg = "Infrastructure error: cannot open '%s': %s" % \
                      (exc.filename, exc.strerror)
            except yaml.YAMLError as exc:
                self.logger.error("[%d] Unable to parse job definition: %s",
                                  job.id, exc)
                msg = "Infrastructure error: cannot parse job definition: %s" % \
                      exc

            self.logger.error("[%d] INCOMPLETE job", job.id)
            fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)

Esempio n. 6

0

Mostra file

File: dispatcher-master.py Progetto: dl9pf/lava-server

    def controler_socket(self):
        msg = self.controler.recv_multipart()
        # This is way to verbose for production and should only be activated
        # by (and for) developers
        # self.logger.debug("[CC] Receiving: %s", msg)

        # 1: the hostname (see ZMQ documentation)
        hostname = msg[0]
        # 2: the action
        action = msg[1]
        # Handle the actions
        if action == 'HELLO' or action == 'HELLO_RETRY':
            self.logger.info("%s => %s", hostname, action)

            # Check the protocol version
            try:
                slave_version = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if slave_version != PROTOCOL_VERSION:
                self.logger.error("<%s> using protocol v%d while master is using v%d",
                                  hostname, slave_version, PROTOCOL_VERSION)
                return False

            self.controler.send_multipart([hostname, 'HELLO_OK'])
            # If the dispatcher is known and sent an HELLO, means that
            # the slave has restarted
            if hostname in self.dispatchers:
                if action == 'HELLO':
                    self.logger.warning("Dispatcher <%s> has RESTARTED",
                                        hostname)
                else:
                    # Assume the HELLO command was received, and the
                    # action succeeded.
                    self.logger.warning("Dispatcher <%s> was not confirmed",
                                        hostname)
            else:
                # No dispatcher, treat HELLO and HELLO_RETRY as a normal HELLO
                # message.
                self.logger.warning("New dispatcher <%s>", hostname)
                self.dispatchers[hostname] = SlaveDispatcher(hostname, online=True)

            if action == 'HELLO':
                # FIXME: slaves need to be allowed to restart cleanly without affecting jobs
                # as well as handling unexpected crashes.
                self._cancel_slave_dispatcher_jobs(hostname)

            # Mark the dispatcher as alive
            self.dispatchers[hostname].alive()

        elif action == 'PING':
            self.logger.debug("%s => PING", hostname)
            # Send back a signal
            self.controler.send_multipart([hostname, 'PONG'])
            self.dispatcher_alive(hostname)

        elif action == 'END':
            try:
                job_id = int(msg[2])
                job_status = int(msg[3])
                error_msg = msg[4]
                description = msg[5]
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            if job_status:
                status = TestJob.INCOMPLETE
                self.logger.info("[%d] %s => END with error %d", job_id, hostname, job_status)
                self.logger.error("[%d] Error: %s", job_id, error_msg)
            else:
                status = TestJob.COMPLETE
                self.logger.info("[%d] %s => END", job_id, hostname)

            # Find the corresponding job and update the status
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update().get(id=job_id)
                    if job.status == TestJob.CANCELING:
                        cancel_job(job)
                    fail_job(job, fail_msg=error_msg, job_status=status)

                # Save the description
                filename = os.path.join(job.output_dir, 'description.yaml')
                try:
                    with open(filename, 'w') as f_description:
                        f_description.write(lzma.decompress(description))
                except (IOError, lzma.error) as exc:
                    self.logger.error("[%d] Unable to dump 'description.yaml'",
                                      job_id)
                    self.logger.exception(exc)
                parse_job_description(job)

            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)
            # ACK even if the job is unknown to let the dispatcher
            # forget about it
            self.controler.send_multipart([hostname, 'END_OK', str(job_id)])
            self.dispatcher_alive(hostname)

        elif action == 'START_OK':
            try:
                job_id = int(msg[2])
            except (IndexError, ValueError):
                self.logger.error("Invalid message from <%s> '%s'", hostname, msg)
                return False
            self.logger.info("[%d] %s => START_OK", job_id, hostname)
            try:
                with transaction.atomic():
                    job = TestJob.objects.select_for_update() \
                                         .get(id=job_id)
                    start_job(job)
            except TestJob.DoesNotExist:
                self.logger.error("[%d] Unknown job", job_id)

            self.dispatcher_alive(hostname)

        else:
            self.logger.error("<%s> sent unknown action=%s, args=(%s)",
                              hostname, action, msg[1:])
        return True

Esempio n. 7

0

Mostra file

File: dispatcher-master.py Progetto: cdaudt/lava-server

    def process_jobs(self, options):
        for job in TestJob.objects.filter(
                Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
                .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
            if job.dynamic_connection:
                # A secondary connection must be made from a dispatcher local to the host device
                # to allow for local firewalls etc. So the secondary connection is started on the
                # remote worker of the "nominated" host.
                # FIXME:
                device = None
                worker_host = job.lookup_worker
                self.logger.info("[%d] START => %s (connection)", job.id,
                                 worker_host.hostname)
            else:
                device = select_device(job, self.dispatchers)
                if not device:
                    return False
                # selecting device can change the job
                job = TestJob.objects.get(id=job.id)
                self.logger.info("[%d] Assigning %s device", job.id, device)
                if job.actual_device is None:
                    device = job.requested_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        return False

                    # Launch the job
                    create_job(job, device)
                    self.logger.info("[%d] START => %s (%s)", job.id,
                                     device.worker_host.hostname,
                                     device.hostname)
                    worker_host = device.worker_host
                else:
                    device = job.actual_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        return False
                    self.logger.info("[%d] START => %s (%s) (retrying)",
                                     job.id, device.worker_host.hostname,
                                     device.hostname)
                    worker_host = device.worker_host
            try:
                # Load job definition to get the variables for template
                # rendering
                job_def = yaml.load(job.definition)
                job_ctx = job_def.get('context', {})

                # Load device configuration
                device_configuration = None \
                    if job.dynamic_connection else device.load_device_configuration(job_ctx)

                env_str = get_env_string(options['env'])
                env_dut_str = get_env_string(options['env_dut'])

                if job.is_multinode:
                    for group_job in job.sub_jobs_list:
                        if group_job.dynamic_connection:
                            # to get this far, the rest of the multinode group must also be ready
                            # so start the dynamic connections
                            # FIXME: rationalise and streamline
                            self.controler.send_multipart([
                                str(worker_host.hostname), 'START',
                                str(group_job.id),
                                self.export_definition(group_job),
                                str(device_configuration), env_str, env_dut_str
                            ])

                self.controler.send_multipart([
                    str(worker_host.hostname), 'START',
                    str(job.id),
                    self.export_definition(job),
                    str(device_configuration), env_str, env_dut_str
                ])

            except (jinja2.TemplateError, IOError, yaml.YAMLError) as exc:
                if isinstance(exc, jinja2.TemplateNotFound):
                    self.logger.error("Template not found: '%s'", exc.message)
                    msg = "Infrastructure error: Template not found: '%s'" % \
                          exc.message
                elif isinstance(exc, jinja2.TemplateSyntaxError):
                    self.logger.error(
                        "Template syntax error in '%s', line %d: %s", exc.name,
                        exc.lineno, exc.message)
                    msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                          (exc.name, exc.lineno, exc.message)
                elif isinstance(exc, IOError):
                    self.logger.error("Unable to read '%s': %s",
                                      options['env'], exc.strerror)
                    msg = "Infrastructure error: cannot open '%s': %s" % \
                          (options['env'], exc.strerror)
                elif isinstance(exc, yaml.YAMLError):
                    self.logger.error("Unable to parse job definition: %s",
                                      exc)
                    msg = "Infrastructure error: cannot parse job definition: %s" % \
                          exc
                else:
                    self.logger.exception(exc)
                    msg = "Infrastructure error: %s" % exc.message

                self.logger.error("[%d] INCOMPLETE job", job.id)
                fail_job(job=job, fail_msg=msg, job_status=TestJob.INCOMPLETE)
        return True

Esempio n. 8

0

Mostra file

File: dispatcher-master.py Progetto: margam2410/lava-server

    def process_jobs(self, options):
        for job in TestJob.objects.filter(
                Q(status=TestJob.SUBMITTED) & Q(is_pipeline=True) & ~Q(actual_device=None))\
                .order_by('-health_check', '-priority', 'submit_time', 'target_group', 'id'):
            if job.dynamic_connection:
                # A secondary connection must be made from a dispatcher local to the host device
                # to allow for local firewalls etc. So the secondary connection is started on the
                # remote worker of the "nominated" host.
                # FIXME:
                worker_host = job.lookup_worker
                self.logger.info("[%d] START => %s (connection)", job.id,
                                 worker_host.hostname)
            else:
                device = select_device(job, self.dispatchers)
                if not device:
                    return False
                # selecting device can change the job
                job = TestJob.objects.get(id=job.id)
                self.logger.info("[%d] Assigning %s device", job.id, device)
                if job.actual_device is None:
                    device = job.requested_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        return False

                    # Launch the job
                    create_job(job, device)
                    self.logger.info("[%d] START => %s (%s)", job.id,
                                     device.worker_host.hostname, device.hostname)
                    worker_host = device.worker_host
                else:
                    device = job.actual_device
                    if not device.worker_host:
                        msg = "Infrastructure error: Invalid worker information"
                        self.logger.error("[%d] %s", job.id, msg)
                        fail_job(job, msg, TestJob.INCOMPLETE)
                        return False
                    self.logger.info("[%d] START => %s (%s) (retrying)", job.id,
                                     device.worker_host.hostname, device.hostname)
                    worker_host = device.worker_host
            try:
                # Load job definition to get the variables for template
                # rendering
                job_def = yaml.load(job.definition)
                job_ctx = job_def.get('context', {})

                # Load device configuration
                device_configuration = None \
                    if job.dynamic_connection else device.load_device_configuration(job_ctx)

                if job.is_multinode:
                    for group_job in job.sub_jobs_list:
                        if group_job.dynamic_connection:
                            # to get this far, the rest of the multinode group must also be ready
                            # so start the dynamic connections
                            # FIXME: rationalise and streamline
                            self.controler.send_multipart(
                                [str(worker_host.hostname),
                                 'START', str(group_job.id), self.export_definition(group_job),
                                 str(device_configuration),
                                 get_env_string(options['env']),
                                 get_env_string(options['env_dut'])])

                self.controler.send_multipart(
                    [str(worker_host.hostname),
                     'START', str(job.id), self.export_definition(job),
                     str(device_configuration),
                     get_env_string(options['env']), get_env_string(options['env_dut'])])

            except (jinja2.TemplateError, IOError, yaml.YAMLError) as exc:
                if isinstance(exc, jinja2.TemplateNotFound):
                    self.logger.error("Template not found: '%s'", exc.message)
                    msg = "Infrastructure error: Template not found: '%s'" % \
                          exc.message
                elif isinstance(exc, jinja2.TemplateSyntaxError):
                    self.logger.error("Template syntax error in '%s', line %d: %s",
                                      exc.name, exc.lineno, exc.message)
                    msg = "Infrastructure error: Template syntax error in '%s', line %d: %s" % \
                          (exc.name, exc.lineno, exc.message)
                elif isinstance(exc, IOError):
                    self.logger.error("Unable to read '%s': %s",
                                      options['env'], exc.strerror)
                    msg = "Infrastructure error: cannot open '%s': %s" % \
                          (options['env'], exc.strerror)
                elif isinstance(exc, yaml.YAMLError):
                    self.logger.error("Unable to parse job definition: %s",
                                      exc)
                    msg = "Infrastructure error: cannot parse job definition: %s" % \
                          exc
                else:
                    self.logger.exception(exc)
                    msg = "Infrastructure error: %s" % exc.message

                self.logger.error("[%d] INCOMPLETE job", job.id)
                job.status = TestJob.INCOMPLETE
                if job.dynamic_connection:
                    job.failure_comment = msg
                    job.save()
                else:
                    new_status = Device.IDLE
                    device.state_transition_to(
                        new_status,
                        message=msg,
                        job=job)
                    device.status = new_status
                    device.current_job = None
                    job.failure_comment = msg
                    job.save()
                    device.save()
        return True