def test_compatibility(self):  # pylint: disable=too-many-locals
    """
    Check that a parsed pipeline job describes a compatibility value.

    A freshly submitted job starts with a pipeline_compatibility of zero
    and no 'compatibility' key in the submitted YAML. After parsing the
    definition against the 'fakeqemu1' device, the pipeline description
    must contain a 'compatibility' value which is at least the BootQEMU
    compatibility.
    """
    submitter = self.factory.make_user()
    # public set in the YAML
    submission_text = self.factory.make_job_json()
    submission = yaml.load(submission_text)
    job = TestJob.from_yaml_and_user(submission_text, submitter)
    self.assertTrue(job.is_public)
    self.assertTrue(job.can_view(submitter))
    # initial state prior to validation
    self.assertEqual(job.pipeline_compatibility, 0)
    self.assertNotIn('compatibility', submission)

    # FIXME: dispatcher master needs to make this kind of test more accessible.
    self.assertNotIn('protocols', yaml.load(job.definition))
    job.actual_device = Device.objects.get(hostname='fakeqemu1')
    context = yaml.load(job.definition).get('context', {})
    parser = JobParser()
    device = job.actual_device

    try:
        # raw dict
        config = device.load_device_configuration(context, system=False)
    except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
        # FIXME: report the exceptions as useful user messages
        self.fail("[%d] jinja2 error: %s" % (job.id, exc))

    if not (config and isinstance(config, dict)):
        # it is an error to have a pipeline device without a device
        # dictionary as it will never get any jobs.
        msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
        self.fail('[%d] device-dictionary error: %s' % (job.id, msg))

    # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
    pipeline_device = PipelineDevice(config, device.hostname)
    # FIXME: drop this nasty hack once 'target' is dropped as a parameter
    if 'target' not in pipeline_device:
        pipeline_device.target = device.hostname
    pipeline_device['hostname'] = device.hostname

    try:
        # pass (unused) output_dir just for validation as there is no zmq
        # socket either.
        pipeline_job = parser.parse(
            job.definition, pipeline_device, job.id,
            None, None, None, output_dir=job.output_dir)
    except (AttributeError, JobError, NotImplementedError, KeyError, TypeError) as exc:
        self.fail('[%s] parser error: %s' % (job.sub_id, exc))

    described = pipeline_job.describe()
    self.assertIn('compatibility', described)
    self.assertGreaterEqual(described['compatibility'], BootQEMU.compatibility)
def test_invalid_multinode(self):  # pylint: disable=too-many-locals
    """
    Check that a multinode submission with a broken image URL can be
    parsed but raises JobError or InfrastructureError when the pipeline
    actions are validated.
    """
    user = self.factory.make_user()
    device_type = self.factory.make_device_type()
    sample_path = os.path.join(os.path.dirname(__file__), 'kvm-multinode.yaml')
    # read the sample inside a context manager so the file handle is
    # closed instead of being left for the garbage collector.
    with open(sample_path, 'r') as sample_file:
        submission = yaml.load(sample_file)

    tag_list = [
        self.factory.ensure_tag('usb-flash'),
        self.factory.ensure_tag('usb-eth')
    ]
    self.factory.make_device(device_type, 'fakeqemu1')
    self.factory.make_device(device_type, 'fakeqemu2')
    self.factory.make_device(device_type, 'fakeqemu3', tags=tag_list)
    deploy = [action['deploy'] for action in submission['actions'] if 'deploy' in action]
    # replace working image with a broken URL
    for block in deploy:
        block['images'] = {
            'rootfs': {
                'url': 'http://localhost/unknown/invalid.gz',
                'image_arg': '{rootfs}'
            }
        }
    job_object_list = _pipeline_protocols(submission, user, yaml.dump(submission))
    self.assertEqual(len(job_object_list), 2)
    self.assertEqual(
        job_object_list[0].sub_id,
        "%d.%d" % (int(job_object_list[0].id), 0))

    # FIXME: dispatcher master needs to make this kind of test more accessible.
    for job in job_object_list:
        definition = yaml.load(job.definition)
        self.assertNotEqual(definition['protocols']['lava-multinode']['sub_id'], '')
        job.actual_device = Device.objects.get(hostname='fakeqemu1')
        job_def = yaml.load(job.definition)
        job_ctx = job_def.get('context', {})
        parser = JobParser()
        device = None
        device_object = None
        if not job.dynamic_connection:
            device = job.actual_device

            try:
                device_config = device.load_device_configuration(job_ctx, system=False)  # raw dict
            except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
                # FIXME: report the exceptions as useful user messages
                self.fail("[%d] jinja2 error: %s" % (job.id, exc))
            if not device_config or not isinstance(device_config, dict):
                # it is an error to have a pipeline device without a device
                # dictionary as it will never get any jobs.
                msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
                self.fail('[%d] device-dictionary error: %s' % (job.id, msg))

            # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
            device_object = PipelineDevice(device_config, device.hostname)
            # FIXME: drop this nasty hack once 'target' is dropped as a parameter
            if 'target' not in device_object:
                device_object.target = device.hostname
            device_object['hostname'] = device.hostname

        validate_list = job.sub_jobs_list if job.is_multinode else [job]
        for check_job in validate_list:
            parser_device = None if job.dynamic_connection else device_object
            try:
                # pass (unused) output_dir just for validation as there is no
                # zmq socket either.
                pipeline_job = parser.parse(
                    check_job.definition, parser_device,
                    check_job.id, None, None, None, output_dir=check_job.output_dir)
            except (AttributeError, JobError, NotImplementedError, KeyError, TypeError) as exc:
                self.fail('[%s] parser error: %s' % (check_job.sub_id, exc))
            with TestCase.assertRaises(self, (JobError, InfrastructureError)) as check:
                pipeline_job.pipeline.validate_actions()
                # NOTE(review): kept inside the context block, so this only
                # runs when validate_actions() does not raise - confirm that
                # this placement is the intended one.
                check_missing_path(self, check, 'qemu-system-x86_64')
    for job in job_object_list:
        job = TestJob.objects.get(id=job.id)
        self.assertNotEqual(job.sub_id, '')
def select_device(job):
    """
    Check that the device reserved for the job is usable and validate the job.

    Transitioning a device from Idle to Reserved is the responsibility of the
    scheduler_daemon (currently). This function just checks that the reserved
    device is valid for this job. Jobs will only enter this function if a
    device is already reserved for that job.

    For a multinode job, the role of each assigned device in the group is
    written into the definition of every sub job. Each job to check is then
    parsed, its pipeline actions are validated and the pipeline description
    is stored as description.yaml in the job output directory.

    To prevent cycling between lava_scheduler_daemon:assign_jobs and here, if
    a job fails validation, the job is incomplete. Issues with this need to be
    fixed using device tags.

    :param job: the job to check.
    :return: the device of the job (None for a dynamic connection job) when
        all checks pass, None as soon as any check fails.
    """
    logger = logging.getLogger('dispatcher-master')
    if not job.dynamic_connection:
        if not job.actual_device:
            return None
        # compare the status by value, object identity is not a reliable
        # equality test for it.
        if job.actual_device.status != Device.RESERVED:
            # should not happen
            logger.error("[%d] device [%s] not in reserved state", job.id, job.actual_device)
            return None

        if job.actual_device.worker_host is None:
            fail_msg = "Misconfigured device configuration for %s - missing worker_host" % job.actual_device
            fail_job(job, fail_msg=fail_msg)
            logger.error(fail_msg)
            # the job has just been failed: stop here instead of going on
            # to validate it.
            return None

    # NOTE(review): yaml.load is used on job definitions throughout; consider
    # yaml.safe_load if definitions can come from untrusted submitters.
    if job.is_multinode:
        # inject the actual group hostnames into the roles for the dispatcher
        # to populate in the overlay.
        devices = {}
        for multinode_job in job.sub_jobs_list:
            # build a list of all devices in this group
            definition = yaml.load(multinode_job.definition)
            # devices are not necessarily assigned to all jobs in a group at
            # the same time: check all jobs in this multinode group before
            # allowing any to start.
            if multinode_job.dynamic_connection:
                logger.debug("[%s] dynamic connection job", multinode_job.sub_id)
                continue
            if not multinode_job.actual_device:
                logger.debug("[%s] job has no device yet", multinode_job.sub_id)
                return None
            hostname = str(multinode_job.actual_device.hostname)
            devices[hostname] = definition['protocols']['lava-multinode']['role']
        for multinode_job in job.sub_jobs_list:
            # apply the complete list to all jobs in this group
            definition = yaml.load(multinode_job.definition)
            definition['protocols']['lava-multinode']['roles'] = devices
            multinode_job.definition = yaml.dump(definition)
            multinode_job.save()

    # Load job definition to get the variables for template rendering
    job_def = yaml.load(job.definition)
    job_ctx = job_def.get('context', {})
    parser = JobParser()
    device = None
    device_object = None
    if not job.dynamic_connection:
        device = job.actual_device

        try:
            device_config = device.load_device_configuration(job_ctx)  # raw dict
        except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
            # FIXME: report the exceptions as useful user messages
            logger.error("[%d] jinja2 error: %s", job.id, exc)
            return None
        if not device_config or not isinstance(device_config, dict):
            # it is an error to have a pipeline device without a device
            # dictionary as it will never get any jobs.
            msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
            logger.error('[%d] device-dictionary error: %s', job.id, msg)
            # as we don't control the scheduler, yet, this has to be an error
            # and an incomplete job. the scheduler_daemon sorts by a fixed
            # order, so this would otherwise just keep on repeating.
            fail_job(job, fail_msg=msg)
            return None

        # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
        device_object = PipelineDevice(device_config, device.hostname)
        # FIXME: drop this nasty hack once 'target' is dropped as a parameter
        if 'target' not in device_object:
            device_object.target = device.hostname
        device_object['hostname'] = device.hostname

    validate_list = job.sub_jobs_list if job.is_multinode else [job]
    for check_job in validate_list:
        parser_device = None if job.dynamic_connection else device_object
        try:
            logger.debug("[%d] parsing definition", check_job.id)
            # pass (unused) output_dir just for validation as there is no zmq
            # socket either.
            pipeline_job = parser.parse(
                check_job.definition, parser_device,
                check_job.id, None, output_dir=check_job.output_dir)
        except (AttributeError, JobError, NotImplementedError, KeyError, TypeError) as exc:
            logger.error('[%d] parser error: %s', check_job.id, exc)
            fail_job(check_job, fail_msg=exc)
            return None
        try:
            logger.debug("[%d] validating actions", check_job.id)
            pipeline_job.pipeline.validate_actions()
        except (AttributeError, JobError, KeyError, TypeError) as exc:
            logger.error({device: exc})
            fail_job(check_job, fail_msg=exc)
            return None
        if pipeline_job:
            # serialise the description once, it is both written and mapped.
            pipeline_dump = yaml.dump(pipeline_job.describe())
            # write the pipeline description to the job output directory.
            if not os.path.exists(check_job.output_dir):
                os.makedirs(check_job.output_dir)
            with open(os.path.join(check_job.output_dir, 'description.yaml'), 'w') as describe_yaml:
                describe_yaml.write(pipeline_dump)
            map_metadata(pipeline_dump, job)
    return device
def select_device(job, dispatchers):
    """
    Check that the device reserved for the job is usable and validate the job.

    Transitioning a device from Idle to Reserved is the responsibility of the
    scheduler_daemon (currently). This function just checks that the reserved
    device is valid for this job. Jobs will only enter this function if a
    device is already reserved for that job.

    For a multinode job, the role of each assigned device in the group is
    written into the definition of every sub job. Each job to check is then
    parsed, its pipeline actions are validated, the pipeline description is
    stored as description.yaml in the job output directory and the
    compatibility value of the description is saved on the job.

    To prevent cycling between lava_scheduler_daemon:assign_jobs and here, if
    a job fails validation, the job is incomplete. Issues with this need to be
    fixed using device tags.

    :param job: the job to check.
    :param dispatchers: container of worker hostnames, tested with ``in``
        against the hostname of the worker_host of the device.
    :return: the device of the job (None for a dynamic connection job) when
        all checks pass, None as soon as any check fails.
    """
    # FIXME: split out dynamic_connection, multinode and validation
    logger = logging.getLogger('dispatcher-master')
    if not job.dynamic_connection:
        if not job.actual_device:
            return None
        # compare the status by value, object identity is not a reliable
        # equality test for it.
        if job.actual_device.status != Device.RESERVED:
            # should not happen
            logger.error("[%d] device [%s] not in reserved state", job.id, job.actual_device)
            return None

        if job.actual_device.worker_host is None:
            fail_msg = "Misconfigured device configuration for %s - missing worker_host" % job.actual_device
            fail_job(job, fail_msg=fail_msg)
            logger.error(fail_msg)
            return None

    # NOTE(review): yaml.load is used on job definitions throughout; consider
    # yaml.safe_load if definitions can come from untrusted submitters.
    if job.is_multinode:
        # inject the actual group hostnames into the roles for the dispatcher
        # to populate in the overlay.
        devices = {}
        for multinode_job in job.sub_jobs_list:
            # build a list of all devices in this group
            definition = yaml.load(multinode_job.definition)
            # devices are not necessarily assigned to all jobs in a group at
            # the same time: check all jobs in this multinode group before
            # allowing any to start.
            if multinode_job.dynamic_connection:
                logger.debug("[%s] dynamic connection job", multinode_job.sub_id)
                continue
            if not multinode_job.actual_device:
                logger.debug("[%s] job has no device yet", multinode_job.sub_id)
                return None
            hostname = str(multinode_job.actual_device.hostname)
            devices[hostname] = definition['protocols']['lava-multinode']['role']
        for multinode_job in job.sub_jobs_list:
            # apply the complete list to all jobs in this group
            definition = yaml.load(multinode_job.definition)
            definition['protocols']['lava-multinode']['roles'] = devices
            multinode_job.definition = yaml.dump(definition)
            multinode_job.save()

    # Load job definition to get the variables for template rendering
    job_def = yaml.load(job.definition)
    job_ctx = job_def.get('context', {})
    parser = JobParser()
    device = None
    device_object = None
    if not job.dynamic_connection:
        device = job.actual_device

        try:
            device_config = device.load_device_configuration(job_ctx)  # raw dict
        except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
            logger.error("[%d] jinja2 error: %s", job.id, exc)
            msg = "Administrative error. Unable to parse '%s'" % exc
            fail_job(job, fail_msg=msg)
            return None
        if not device_config or not isinstance(device_config, dict):
            # it is an error to have a pipeline device without a device
            # dictionary as it will never get any jobs.
            msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
            logger.error('[%d] device-dictionary error: %s', job.id, msg)
            # as we don't control the scheduler, yet, this has to be an error
            # and an incomplete job. the scheduler_daemon sorts by a fixed
            # order, so this would otherwise just keep on repeating.
            fail_job(job, fail_msg=msg)
            return None
        if not device.worker_host or not device.worker_host.hostname:
            msg = "Administrative error. Device '%s' has no worker host." % device.hostname
            logger.error('[%d] worker host error: %s', job.id, msg)
            fail_job(job, fail_msg=msg)
            return None
        if device.worker_host.hostname not in dispatchers:
            # a configured worker has not called in to this master
            # likely that the worker is misconfigured - polling the wrong
            # master or simply not running at all.
            msg = (
                "Administrative error. Device '{0}' has a worker_host setting of "
                "'{1}' but no slave has registered with this master "
                "using that FQDN."
            ).format(device.hostname, device.worker_host.hostname)
            logger.error('[%d] worker-hostname error: %s', job.id, msg)
            fail_job(job, fail_msg=msg)
            return None

        # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
        device_object = PipelineDevice(device_config, device.hostname)
        # FIXME: drop this nasty hack once 'target' is dropped as a parameter
        if 'target' not in device_object:
            device_object.target = device.hostname
        device_object['hostname'] = device.hostname

    validate_list = job.sub_jobs_list if job.is_multinode else [job]
    for check_job in validate_list:
        parser_device = None if job.dynamic_connection else device_object
        try:
            logger.info("[%d] Parsing definition", check_job.id)
            # pass (unused) output_dir just for validation as there is no zmq
            # socket either.
            pipeline_job = parser.parse(
                check_job.definition, parser_device,
                check_job.id, None, output_dir=check_job.output_dir)
        except (AttributeError, JobError, NotImplementedError, KeyError, TypeError) as exc:
            logger.error('[%d] parser error: %s', check_job.id, exc)
            fail_job(check_job, fail_msg=exc)
            return None
        try:
            logger.info("[%d] Validating actions", check_job.id)
            pipeline_job.pipeline.validate_actions()
        except (AttributeError, JobError, KeyError, TypeError) as exc:
            logger.error({device: exc})
            fail_job(check_job, fail_msg=exc)
            return None
        if pipeline_job:
            pipeline = pipeline_job.describe()
            # serialise the description once, it is both written and mapped.
            pipeline_dump = yaml.dump(pipeline)
            # write the pipeline description to the job output directory.
            if not os.path.exists(check_job.output_dir):
                os.makedirs(check_job.output_dir)
            with open(os.path.join(check_job.output_dir, 'description.yaml'), 'w') as describe_yaml:
                describe_yaml.write(pipeline_dump)
            map_metadata(pipeline_dump, job)
            # add the compatibility result from the master to the definition
            # for comparison on the slave.
            if 'compatibility' in pipeline:
                try:
                    compat = int(pipeline['compatibility'])
                except (TypeError, ValueError):
                    # int() raises TypeError, not ValueError, for values like
                    # None or a list: treat those as unparseable as well.
                    logger.error("[%d] Unable to parse job compatibility: %s",
                                 check_job.id, pipeline['compatibility'])
                    compat = 0
                check_job.pipeline_compatibility = compat
                check_job.save(update_fields=['pipeline_compatibility'])
            else:
                logger.error("[%d] Unable to identify job compatibility.", check_job.id)
                fail_job(check_job, fail_msg='Unknown compatibility')
                return None

    return device
def select_device(job, dispatchers):  # pylint: disable=too-many-return-statements
    """
    Check that the device reserved for the job is usable.

    Transitioning a device from Idle to Reserved is the responsibility of the
    scheduler_daemon (currently). This function just checks that the reserved
    device is valid for this job. Jobs will only enter this function if a
    device is already reserved for that job.

    For a multinode job, the role of each assigned device in the group is
    written into the definition of every sub job.

    To prevent cycling between lava_scheduler_daemon:assign_jobs and here, if
    a job fails validation, the job is incomplete. Issues with this need to be
    fixed using device tags.

    :param job: the job to check.
    :param dispatchers: container of worker hostnames, tested with ``in``
        against the hostname of the worker_host of the device.
    :return: the device of the job (None for a dynamic connection job) when
        all checks pass, None as soon as any check fails or when the worker
        of the device is not in ``dispatchers``.
    """
    # FIXME: split out dynamic_connection, multinode and validation
    logger = logging.getLogger('dispatcher-master')
    if not job.dynamic_connection:
        if not job.actual_device:
            return None
        # compare the status by value, object identity is not a reliable
        # equality test for it.
        if job.actual_device.status != Device.RESERVED:
            # should not happen
            logger.error("[%d] device [%s] not in reserved state", job.id, job.actual_device)
            return None

        if job.actual_device.worker_host is None:
            fail_msg = "Misconfigured device configuration for %s - missing worker_host" % job.actual_device
            fail_job(job, fail_msg=fail_msg)
            logger.error(fail_msg)
            return None

    # NOTE(review): yaml.load is used on job definitions throughout; consider
    # yaml.safe_load if definitions can come from untrusted submitters.
    if job.is_multinode:
        # inject the actual group hostnames into the roles for the dispatcher
        # to populate in the overlay.
        devices = {}
        for multinode_job in job.sub_jobs_list:
            # build a list of all devices in this group
            definition = yaml.load(multinode_job.definition)
            # devices are not necessarily assigned to all jobs in a group at
            # the same time: check all jobs in this multinode group before
            # allowing any to start.
            if multinode_job.dynamic_connection:
                logger.debug("[%s] dynamic connection job", multinode_job.sub_id)
                continue
            if not multinode_job.actual_device:
                logger.debug("[%s] job has no device yet", multinode_job.sub_id)
                return None
            hostname = str(multinode_job.actual_device.hostname)
            devices[hostname] = definition['protocols']['lava-multinode']['role']
        for multinode_job in job.sub_jobs_list:
            # apply the complete list to all jobs in this group
            definition = yaml.load(multinode_job.definition)
            definition['protocols']['lava-multinode']['roles'] = devices
            multinode_job.definition = yaml.dump(definition)
            multinode_job.save()

    # Load job definition to get the variables for template rendering
    job_def = yaml.load(job.definition)
    job_ctx = job_def.get('context', {})
    device = None
    if not job.dynamic_connection:
        device = job.actual_device

        try:
            device_config = device.load_device_configuration(job_ctx)  # raw dict
        except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
            logger.error("[%d] jinja2 error: %s", job.id, exc)
            msg = "Administrative error. Unable to parse device configuration: '%s'" % exc
            fail_job(job, fail_msg=msg)
            return None
        if not device_config or not isinstance(device_config, dict):
            # it is an error to have a pipeline device without a device
            # dictionary as it will never get any jobs.
            msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
            logger.error('[%d] device-dictionary error: %s', job.id, msg)
            # as we don't control the scheduler, yet, this has to be an error
            # and an incomplete job. the scheduler_daemon sorts by a fixed
            # order, so this would otherwise just keep on repeating.
            fail_job(job, fail_msg=msg)
            return None
        if not device.worker_host or not device.worker_host.hostname:
            msg = "Administrative error. Device '%s' has no worker host." % device.hostname
            logger.error('[%d] worker host error: %s', job.id, msg)
            fail_job(job, fail_msg=msg)
            return None
        if device.worker_host.hostname not in dispatchers:
            # A configured worker has not (yet) called in to this master.
            # It is likely that the worker is misconfigured - polling the
            # wrong master or simply not running at all. There is also a
            # possible race condition here when the master gets restarted
            # with a queue of jobs and has not yet received polls from all
            # slaves, so do not fail the job.
            msg = "Device '{0}' has a worker_host setting of " \
                  "'{1}' but no slave has yet registered with this master " \
                  "using that FQDN.".format(device.hostname, device.worker_host.hostname)
            logger.info('[%d] worker-hostname not seen: %s', job.id, msg)
            return None

        # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
        # NOTE(review): device_object is never read after being set up here;
        # it is kept in case building it acts as a check of device_config -
        # confirm before removing.
        device_object = PipelineDevice(device_config, device.hostname)
        # FIXME: drop this nasty hack once 'target' is dropped as a parameter
        if 'target' not in device_object:
            device_object.target = device.hostname
        device_object['hostname'] = device.hostname

    return device
def select_device(job):
    """
    Check that the device reserved for the job is usable and validate the job.

    Transitioning a device from Idle to Reserved is the responsibility of the
    scheduler_daemon (currently). This function just checks that the reserved
    device is valid for this job. Jobs will only enter this function if a
    device is already reserved for that job.

    For a multinode job, the role of each device in the group is written
    into the definition of every sub job. The job is then parsed, its
    pipeline actions are validated and the pipeline description is stored
    as description.yaml in the job output directory.

    To prevent cycling between lava_scheduler_daemon:assign_jobs and here, if
    a job fails validation, the job is incomplete. Issues with this need to be
    fixed using device tags.

    :param job: the job to check.
    :return: the device of the job when all checks pass, None as soon as any
        check fails.
    """
    logger = logging.getLogger('dispatcher-master')
    if not job.actual_device:
        # should not happen.
        logger.error("[%d] no device reserved", job.id)
        return None

    # compare the status by value, object identity is not a reliable
    # equality test for it.
    if job.actual_device.status != Device.RESERVED:
        # should not happen
        logger.error("[%d] device [%s] not in reserved state", job.id, job.actual_device)
        return None

    if job.actual_device.worker_host is None:
        fail_msg = "Misconfigured device configuration for %s - missing worker_host" % job.actual_device
        end_job(job, fail_msg=fail_msg, job_status=TestJob.INCOMPLETE)
        logger.error(fail_msg)
        # the job has just been ended as incomplete: stop here instead of
        # going on to validate it.
        return None

    # NOTE(review): yaml.load is used on job definitions throughout; consider
    # yaml.safe_load if definitions can come from untrusted submitters.
    if job.is_multinode:
        # inject the actual group hostnames into the roles for the dispatcher
        # to populate in the overlay.
        devices = {}
        for multinode_job in job.sub_jobs_list:
            # build a list of all devices in this group
            definition = yaml.load(multinode_job.definition)
            # devices are not necessarily assigned to all jobs in a group at
            # the same time: check all jobs in this multinode group before
            # allowing any to start.
            if not multinode_job.actual_device:
                logger.debug("[%s] job has no device yet", multinode_job.sub_id)
                return None
            hostname = str(multinode_job.actual_device.hostname)
            devices[hostname] = definition['protocols']['lava-multinode']['role']
        for multinode_job in job.sub_jobs_list:
            # apply the complete list to all jobs in this group
            definition = yaml.load(multinode_job.definition)
            definition['protocols']['lava-multinode']['roles'] = devices
            multinode_job.definition = yaml.dump(definition)
            multinode_job.save()

    # Load job definition to get the variables for template rendering
    job_def = yaml.load(job.definition)
    job_ctx = job_def.get('context', {})
    device = job.actual_device

    try:
        device_config = device.load_device_configuration(job_ctx)  # raw dict
    except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
        # FIXME: report the exceptions as useful user messages
        logger.error({'jinja2': exc})
        return None
    if not device_config or not isinstance(device_config, dict):
        # it is an error to have a pipeline device without a device
        # dictionary as it will never get any jobs.
        msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
        logger.error({'device-dictionary': msg})
        # as we don't control the scheduler, yet, this has to be an error and
        # an incomplete job. the scheduler_daemon sorts by a fixed order, so
        # this would otherwise just keep on repeating.
        end_job(job, fail_msg=msg, job_status=TestJob.INCOMPLETE)
        return None

    parser = JobParser()
    # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
    obj = PipelineDevice(device_config, device.hostname)
    # FIXME: drop this nasty hack once 'target' is dropped as a parameter
    if 'target' not in obj:
        obj.target = device.hostname
    obj['hostname'] = device.hostname

    # pass (unused) output_dir just for validation as there is no zmq socket
    # either.
    try:
        pipeline_job = parser.parse(job.definition, obj, job.id, None, output_dir='/tmp')
    except (JobError, AttributeError, NotImplementedError, KeyError, TypeError) as exc:
        logger.error({'parser': exc})
        end_job(job, fail_msg=exc, job_status=TestJob.INCOMPLETE)
        return None

    try:
        pipeline_job.pipeline.validate_actions()
    except (AttributeError, JobError, KeyError, TypeError) as exc:
        logger.error({device: exc})
        end_job(job, fail_msg=exc, job_status=TestJob.INCOMPLETE)
        return None
    if pipeline_job:
        # serialise the description once, it is both written and mapped.
        pipeline_dump = yaml.dump(pipeline_job.describe())
        # write the pipeline description to the job output directory.
        if not os.path.exists(job.output_dir):
            os.makedirs(job.output_dir)
        with open(os.path.join(job.output_dir, 'description.yaml'), 'w') as describe_yaml:
            describe_yaml.write(pipeline_dump)
        map_metadata(pipeline_dump, job)
    return device
def test_invalid_multinode(self):
    """
    Check that a multinode submission with a broken image URL can be
    parsed. When /dev/loop0 exists, validating the pipeline actions must
    also raise JobError.
    """
    user = self.factory.make_user()
    self.device_type = self.factory.make_device_type()
    sample_path = os.path.join(os.path.dirname(__file__), 'kvm-multinode.yaml')
    # read the sample inside a context manager so the file handle is
    # closed instead of being left for the garbage collector.
    with open(sample_path, 'r') as sample_file:
        submission = yaml.load(sample_file)

    tag_list = [
        self.factory.ensure_tag('usb-flash'),
        self.factory.ensure_tag('usb-eth')
    ]
    self.factory.make_device(self.device_type, 'fakeqemu1')
    self.factory.make_device(self.device_type, 'fakeqemu2')
    self.factory.make_device(self.device_type, 'fakeqemu3', tags=tag_list)
    deploy = [action['deploy'] for action in submission['actions'] if 'deploy' in action]
    # replace working image with a broken URL
    for block in deploy:
        block['image'] = 'http://localhost/unknown/invalid.gz'
    job_object_list = _pipeline_protocols(submission, user, yaml.dump(submission))
    self.assertEqual(len(job_object_list), 2)
    self.assertEqual(
        job_object_list[0].sub_id,
        "%d.%d" % (int(job_object_list[0].id), 0))

    # FIXME: dispatcher master needs to make this kind of test more accessible.
    for job in job_object_list:
        definition = yaml.load(job.definition)
        self.assertNotEqual(definition['protocols']['lava-multinode']['sub_id'], '')
        job.actual_device = Device.objects.get(hostname='fakeqemu1')
        job_def = yaml.load(job.definition)
        job_ctx = job_def.get('context', {})
        parser = JobParser()
        device = None
        device_object = None
        if not job.dynamic_connection:
            device = job.actual_device

            try:
                device_config = device.load_device_configuration(job_ctx)  # raw dict
            except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
                # FIXME: report the exceptions as useful user messages
                self.fail("[%d] jinja2 error: %s" % (job.id, exc))
            if not device_config or not isinstance(device_config, dict):
                # it is an error to have a pipeline device without a device
                # dictionary as it will never get any jobs.
                msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
                self.fail('[%d] device-dictionary error: %s' % (job.id, msg))

            # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
            device_object = PipelineDevice(device_config, device.hostname)
            # FIXME: drop this nasty hack once 'target' is dropped as a parameter
            if 'target' not in device_object:
                device_object.target = device.hostname
            device_object['hostname'] = device.hostname

        validate_list = job.sub_jobs_list if job.is_multinode else [job]
        for check_job in validate_list:
            parser_device = None if job.dynamic_connection else device_object
            try:
                # pass (unused) output_dir just for validation as there is no
                # zmq socket either.
                pipeline_job = parser.parse(
                    check_job.definition, parser_device,
                    check_job.id, None, output_dir=check_job.output_dir)
            except (AttributeError, JobError, NotImplementedError, KeyError, TypeError) as exc:
                self.fail('[%s] parser error: %s' % (check_job.sub_id, exc))
            # rather than skipping the entire test, just the validation.
            if os.path.exists('/dev/loop0'):
                self.assertRaises(JobError, pipeline_job.pipeline.validate_actions)
    for job in job_object_list:
        job = TestJob.objects.get(id=job.id)
        self.assertNotEqual(job.sub_id, '')
def select_device(job, dispatchers):  # pylint: disable=too-many-return-statements
    """
    Check that the device reserved for the job is usable.

    Transitioning a device from Idle to Reserved is the responsibility of the
    scheduler_daemon (currently). This function just checks that the reserved
    device is valid for this job. Jobs will only enter this function if a
    device is already reserved for that job.

    For a multinode job, the role of each assigned device in the group is
    written into the definition of every sub job.

    To prevent cycling between lava_scheduler_daemon:assign_jobs and here, if
    a job fails validation, the job is incomplete. Issues with this need to be
    fixed using device tags.

    :param job: the job to check.
    :param dispatchers: container of worker hostnames, tested with ``in``
        against the hostname of the worker_host of the device.
    :return: the device of the job (None for a dynamic connection job) when
        all checks pass, None as soon as any check fails or when the worker
        of the device is not in ``dispatchers``.
    """
    # FIXME: split out dynamic_connection, multinode and validation
    logger = logging.getLogger('dispatcher-master')

    if not job.dynamic_connection:
        reserved = job.actual_device
        if not reserved:
            return None
        # compare the status by value, object identity is not a reliable
        # equality test for it.
        if reserved.status != Device.RESERVED:
            # should not happen
            logger.error("[%d] device [%s] not in reserved state", job.id, job.actual_device)
            return None
        if reserved.worker_host is None:
            fail_msg = "Misconfigured device configuration for %s - missing worker_host" % job.actual_device
            fail_job(job, fail_msg=fail_msg)
            logger.error(fail_msg)
            return None

    # NOTE(review): yaml.load is used on job definitions throughout; consider
    # yaml.safe_load if definitions can come from untrusted submitters.
    if job.is_multinode:
        # inject the actual group hostnames into the roles for the dispatcher
        # to populate in the overlay.
        devices = {}
        for multinode_job in job.sub_jobs_list:
            # build a list of all devices in this group
            definition = yaml.load(multinode_job.definition)
            # devices are not necessarily assigned to all jobs in a group at
            # the same time: check all jobs in this multinode group before
            # allowing any to start.
            if multinode_job.dynamic_connection:
                logger.debug("[%s] dynamic connection job", multinode_job.sub_id)
                continue
            if not multinode_job.actual_device:
                logger.debug("[%s] job has no device yet", multinode_job.sub_id)
                return None
            hostname = str(multinode_job.actual_device.hostname)
            devices[hostname] = definition['protocols']['lava-multinode']['role']
        for multinode_job in job.sub_jobs_list:
            # apply the complete list to all jobs in this group
            definition = yaml.load(multinode_job.definition)
            definition['protocols']['lava-multinode']['roles'] = devices
            multinode_job.definition = yaml.dump(definition)
            multinode_job.save()

    # Load job definition to get the variables for template rendering
    job_def = yaml.load(job.definition)
    job_ctx = job_def.get('context', {})
    device = None
    if not job.dynamic_connection:
        device = job.actual_device

        try:
            device_config = device.load_device_configuration(job_ctx)  # raw dict
        except (jinja2.TemplateError, yaml.YAMLError, IOError) as exc:
            logger.error("[%d] jinja2 error: %s", job.id, exc)
            msg = "Administrative error. Unable to parse device configuration: '%s'" % exc
            fail_job(job, fail_msg=msg)
            return None
        if not device_config or not isinstance(device_config, dict):
            # it is an error to have a pipeline device without a device
            # dictionary as it will never get any jobs.
            msg = "Administrative error. Device '%s' has no device dictionary." % device.hostname
            logger.error('[%d] device-dictionary error: %s', job.id, msg)
            # as we don't control the scheduler, yet, this has to be an error
            # and an incomplete job. the scheduler_daemon sorts by a fixed
            # order, so this would otherwise just keep on repeating.
            fail_job(job, fail_msg=msg)
            return None
        if not device.worker_host or not device.worker_host.hostname:
            msg = "Administrative error. Device '%s' has no worker host." % device.hostname
            logger.error('[%d] worker host error: %s', job.id, msg)
            fail_job(job, fail_msg=msg)
            return None
        if device.worker_host.hostname not in dispatchers:
            # A configured worker has not (yet) called in to this master.
            # It is likely that the worker is misconfigured - polling the
            # wrong master or simply not running at all. There is also a
            # possible race condition here when the master gets restarted
            # with a queue of jobs and has not yet received polls from all
            # slaves, so do not fail the job.
            msg = "Device '{0}' has a worker_host setting of " \
                  "'{1}' but no slave has yet registered with this master " \
                  "using that FQDN.".format(device.hostname, device.worker_host.hostname)
            logger.info('[%d] worker-hostname not seen: %s', job.id, msg)
            return None

        # equivalent of the NewDevice in lava-dispatcher, without .yaml file.
        # NOTE(review): device_object is never read after being set up here;
        # it is kept in case building it acts as a check of device_config -
        # confirm before removing.
        device_object = PipelineDevice(device_config, device.hostname)
        # FIXME: drop this nasty hack once 'target' is dropped as a parameter
        if 'target' not in device_object:
            device_object.target = device.hostname
        device_object['hostname'] = device.hostname

    return device