def test_calls_where_node_has_no_gpus(self):
    node_id = 7
    job_id = 10
    gpu_count = 2
    required_gpus = 2
    GPUManager.define_node_gpus(node_id, gpu_count)

    # Switch to a node that has no GPUs defined; every call should fail gracefully
    node_id = 8
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))
    self.assertFalse(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "")
def accept_new_job_exe(self, job_exe):
    """Asks the node if it can accept the given new job execution

    :param job_exe: The new job execution
    :type job_exe: :class:`queue.job_exe.QueuedJobExecution`
    :returns: True if the new job execution was accepted, False otherwise
    :rtype: bool
    """

    if not self.is_ready_for_new_job:
        return False

    resources = job_exe.required_resources
    if self._remaining_resources.is_sufficient_to_meet(resources):
        if resources.gpus > 0:
            if not GPUManager.reserve_gpus_for_job(self.node_id, int(resources.gpus)):
                return False
        self._allocated_queued_job_exes.append(job_exe)
        self.allocated_resources.add(resources)
        self._remaining_resources.subtract(resources)
        job_exe.scheduled(self.agent_id, self.node_id, resources)
        return True

    return False
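# Illustrative sketch, not part of the scheduler code: the reservation contract that
# accept_new_job_exe() above relies on. GPUs are reserved against the node before the
# job execution is committed, so two queued GPU jobs cannot overcommit the same node.
# The import path and the node/GPU values below are assumptions made for the example.
from node.resources.gpu_manager import GPUManager  # assumed module path

GPUManager.define_node_gpus(1, 2)                # node 1 offers 2 GPUs
assert GPUManager.reserve_gpus_for_job(1, 2)     # first job holds both GPUs
assert not GPUManager.reserve_gpus_for_job(1, 1) # second job is rejected, nothing left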
def __init__(self, agent_id, node, tasks, running_job_exes, resource_set):
    """Constructor

    :param agent_id: The agent ID
    :type agent_id: string
    :param node: The node
    :type node: :class:`scheduler.node.node_class.Node`
    :param tasks: The current tasks running on the node
    :type tasks: :func:`list`
    :param running_job_exes: The current job executions running on the node
    :type running_job_exes: :func:`list`
    :param resource_set: The set of resources for the node
    :type resource_set: :class:`scheduler.resources.agent.ResourceSet`
    """

    self.agent_id = agent_id  # Set agent ID separately from node since it can change during scheduling
    self.hostname = node.hostname
    self.node_id = node.id
    self.is_ready_for_new_job = node.is_ready_for_new_job()  # Cache this for consistency
    self.is_ready_for_next_job_task = node.is_ready_for_next_job_task()  # Cache this for consistency
    self.is_ready_for_system_task = node.is_ready_for_system_task()  # Cache this for consistency
    self.allocated_offers = []
    self.allocated_resources = NodeResources()
    self.allocated_tasks = []  # Tasks that have been allocated resources from this node
    self._node = node
    self._allocated_queued_job_exes = []  # New queued job executions that have been allocated resources
    self._allocated_running_job_exes = []  # Running job executions that have been allocated resources
    self._running_job_exes = running_job_exes
    self._running_tasks = tasks
    self._offered_resources = NodeResources()  # The amount of resources that were originally offered
    self._offered_resources.add(resource_set.offered_resources)
    self._remaining_resources = NodeResources()
    self._remaining_resources.add(self._offered_resources)
    self._task_resources = resource_set.task_resources
    self._watermark_resources = resource_set.watermark_resources

    if int(resource_set.offered_resources.gpus) > 0:
        GPUManager.define_node_gpus(self.node_id, int(resource_set.offered_resources.gpus))
def test_add_less_gpu(self):
    node_id = 2
    gpu_count = 3
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 3)

    # Defining fewer GPUs than the node already has should not lower the count
    gpu_count = 1
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 3)
def test_reserve_gpu(self):
    node_id = 4
    gpu_count = 2
    required_gpus = 2
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertTrue(GPUManager.reserve_gpus_for_job(node_id, required_gpus))

    job_id = 11
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))

    gpu_count = 4
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertTrue(GPUManager.reserve_gpus_for_job(node_id, required_gpus))
def create_job_exe_model(self, framework_id, when):
    """Creates and returns a scheduled job execution model

    :param framework_id: The scheduling framework ID
    :type framework_id: string
    :param when: The start time
    :type when: :class:`datetime.datetime`
    :returns: The job execution model
    :rtype: :class:`job.models.JobExecution`
    """

    job_exe = JobExecution()
    job_exe.job_id = self._queue.job_id
    job_exe.job_type_id = self._queue.job_type_id
    job_exe.recipe_id = self._queue.recipe_id
    job_exe.batch_id = self._queue.batch_id
    job_exe.exe_num = self._queue.exe_num
    job_exe.timeout = self._queue.timeout
    job_exe.docker_image = self._queue.docker_image
    job_exe.input_file_size = self._queue.input_file_size
    job_exe.configuration = self.configuration.get_dict()
    job_exe.queued = self._queue.queued

    if self.is_canceled:
        job_exe.node_id = None
        job_exe.resources = NodeResources().get_json().get_dict()
        job_exe.started = None
    else:
        job_exe.node_id = self._scheduled_node_id
        job_exe.resources = self._scheduled_resources.get_json().get_dict()
        job_exe.started = when

    job_exe.set_cluster_id(framework_id, self._queue.job_id, self._queue.exe_num)

    if self.required_resources.gpus > 0:
        if not GPUManager.assign_gpus_for_job(job_exe.node_id, job_exe.job_id, self.required_resources.gpus):
            logger.error('Job %s was unable to assign %s reserved GPUs on node %s. This should not be possible; '
                         'the GPUs should have been reserved when the job was scheduled.',
                         job_exe.job_id, self.required_resources.gpus, job_exe.node_id)

    return job_exe
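# Hedged illustration of the two-phase GPU handling used in create_job_exe_model(): GPUs
# are reserved for the node while the job is being scheduled, then bound to the concrete
# job ID once the execution model exists. The import path and IDs are example assumptions.
from node.resources.gpu_manager import GPUManager  # assumed module path

GPUManager.define_node_gpus(3, 4)
GPUManager.reserve_gpus_for_job(3, 2)              # phase 1: hold 2 of the node's 4 GPUs
GPUManager.assign_gpus_for_job(3, 42, 2)           # phase 2: bind the reserved GPUs to job 42
print(GPUManager.get_nvidia_docker_label(3, 42))   # "0,1", per the tests in this section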
def update(self, status):
    """Invoked when the status of a task has changed (e.g., a slave is lost and so the task is lost, a task
    finishes and an executor sends a status update saying so, etc.) Note that returning from this callback
    acknowledges receipt of this status update. If for whatever reason the scheduler aborts during this callback
    (or the process exits) another status update will be delivered. Note, however, that this is currently not
    true if the slave sending the status update is lost or fails during that time.
    """

    started = now()

    model = utils.create_task_update_model(status)
    mesos_status = model.status
    task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
    task_id = task_update.task_id
    was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
    was_job_finished = False

    if mesos_status == 'TASK_ERROR':
        logger.error('Status update for task %s: %s', task_id, mesos_status)
    elif mesos_status == 'TASK_LOST':
        logger.warning('Status update for task %s: %s', task_id, mesos_status)
    else:
        logger.info('Status update for task %s: %s', task_id, mesos_status)

    # Since we have a status update for this task, remove it from reconciliation set
    recon_mgr.remove_task_id(task_id)

    # Hand off task update to be saved in the database
    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Grab job execution ID from manager
        cluster_id = JobExecution.parse_cluster_id(task_id)
        job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
        if job_exe:
            model.job_exe_id = job_exe.id
    task_update_mgr.add_task_update(model)

    # Update task with latest status
    # This should happen before the job execution or node manager are updated, since they will assume that the
    # task has already been updated
    task_mgr.handle_task_update(task_update)

    if task_id.startswith(JOB_TASK_ID_PREFIX):
        # Job task, so update the job execution
        try:
            job_exe = job_exe_mgr.handle_task_update(task_update)
            if job_exe and job_exe.is_finished():
                logger.info('job_exe with job id %s and node id %s is finished', job_exe.job_id, job_exe.node_id)
                was_job_finished = True
                cleanup_mgr.add_job_execution(job_exe)
                GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)
        except Exception:
            cluster_id = JobExecution.parse_cluster_id(task_id)
            logger.exception('Error handling status update for job execution: %s', cluster_id)
            # Error handling status update, add task so it can be reconciled
            task = task_mgr.get_task(task_id)
            if task:
                recon_mgr.add_tasks([task])
    else:
        # Not a job task, so must be either a node or system task
        node_mgr.handle_task_update(task_update)
        system_task_mgr.handle_task_update(task_update)

    scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

    duration = now() - started
    msg = 'Scheduler statusUpdate() took %.3f seconds'
    if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
        logger.warning(msg, duration.total_seconds())
    else:
        logger.debug(msg, duration.total_seconds())
def setUp(self):
    django.setup()

    GPUManager.reset_gpu_dict()
    self.agent_id = 'agent_1'
def test_release_gpu(self):
    node_id = 7
    job_id = 10
    gpu_count = 2
    required_gpus = 2
    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))

    job_id = 11
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))  # shouldn't have enough GPUs

    GPUManager.release_gpus(node_id, 10)
    self.assertTrue(GPUManager.reserve_gpus_for_job(node_id, required_gpus))  # GPUs should be available again
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))  # GPUs should be available again
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "0,1")
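# Minimal sketch of the release path exercised by test_release_gpu() above: once
# release_gpus() is called for a finished job, the same GPU indices become available
# for the next job on that node. Import path and IDs are assumptions for illustration.
from node.resources.gpu_manager import GPUManager  # assumed module path

GPUManager.define_node_gpus(9, 2)
GPUManager.reserve_gpus_for_job(9, 2)
GPUManager.assign_gpus_for_job(9, 100, 2)   # job 100 is handed GPUs "0,1"
GPUManager.release_gpus(9, 100)             # job 100 finished; its GPUs are freed
GPUManager.reserve_gpus_for_job(9, 2)       # a new job can now reserve the node's GPUs
GPUManager.assign_gpus_for_job(9, 101, 2)   # job 101 gets the same "0,1" label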
def test_get_nvidia_label(self):
    node_id = 6
    job_id = 10
    gpu_count = 2
    required_gpus = 2
    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "0,1")

    gpu_count = 4
    job_id = 11
    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
    nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
    self.assertEqual(nvidia_label, "2,3")
def setUp(self):
    GPUManager.reset_gpu_dict()
def test_assign_gpus(self):
    node_id = 5
    job_id = 10
    gpu_count = 2
    required_gpus = 2
    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))

    job_id = 11
    self.assertFalse(GPUManager.reserve_gpus_for_job(node_id, required_gpus))  # shouldn't have enough GPUs

    gpu_count = 4
    GPUManager.define_node_gpus(node_id, gpu_count)
    GPUManager.reserve_gpus_for_job(node_id, required_gpus)
    self.assertTrue(GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
def test_add_additional_GPU(self):
    node_id = 3
    gpu_count = 4
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 4)
def test_add_new_node_gpus(self):
    node_id = 1
    gpu_count = 3
    GPUManager.define_node_gpus(node_id, gpu_count)
    self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), gpu_count)
def _configure_all_tasks(self, config, job_exe, job_type):
    """Configures the given execution with items that apply to all tasks

    :param config: The execution configuration
    :type config: :class:`job.execution.configuration.json.exe_config.ExecutionConfiguration`
    :param job_exe: The job execution model being scheduled
    :type job_exe: :class:`job.models.JobExecution`
    :param job_type: The job type model
    :type job_type: :class:`job.models.JobType`
    """

    config.set_task_ids(job_exe.get_cluster_id())

    for task_type in config.get_task_types():
        # Configure env vars describing allocated task resources
        env_vars = {}
        nvidia_docker_label = None

        for resource in config.get_resources(task_type).resources:
            env_name = 'ALLOCATED_%s' % normalize_env_var_name(resource.name)
            env_vars[env_name] = '%.1f' % resource.value  # Assumes scalar resources
            if resource.name == "gpus" and int(resource.value) > 0:
                gpu_list = GPUManager.get_nvidia_docker_label(job_exe.node_id, job_exe.job_id)
                nvidia_docker_label = DockerParameter('env',
                                                      'NVIDIA_VISIBLE_DEVICES={}'.format(gpu_list.strip(',')))

        # Configure env vars for Scale meta-data
        env_vars['SCALE_JOB_ID'] = unicode(job_exe.job_id)
        env_vars['SCALE_EXE_NUM'] = unicode(job_exe.exe_num)
        if job_exe.recipe_id:
            env_vars['SCALE_RECIPE_ID'] = unicode(job_exe.recipe_id)
        if job_exe.batch_id:
            env_vars['SCALE_BATCH_ID'] = unicode(job_exe.batch_id)

        # Configure workspace volumes
        workspace_volumes = {}
        for task_workspace in config.get_workspaces(task_type):
            logger.debug(self._workspaces)
            workspace_model = self._workspaces[task_workspace.name]
            # TODO: Should refactor workspace broker to return a Volume object and remove BrokerVolume
            if workspace_model.volume:
                vol_name = get_workspace_volume_name(job_exe, task_workspace.name)
                cont_path = get_workspace_volume_path(workspace_model.name)
                if workspace_model.volume.host:
                    host_path = workspace_model.volume.remote_path
                    volume = Volume(vol_name, cont_path, task_workspace.mode, is_host=True, host_path=host_path)
                else:
                    driver = workspace_model.volume.driver
                    driver_opts = {}
                    # TODO: Hack alert for nfs broker, as stated above, we should return Volume from broker
                    if driver == 'nfs':
                        driver_opts = {'share': workspace_model.volume.remote_path}
                    volume = Volume(vol_name, cont_path, task_workspace.mode, is_host=False, driver=driver,
                                    driver_opts=driver_opts)
                workspace_volumes[task_workspace.name] = volume

        config.add_to_task(task_type, env_vars=env_vars, wksp_volumes=workspace_volumes)

    # Labels for metric grouping
    job_id_label = DockerParameter('label', 'scale-job-id={}'.format(job_exe.job_id))
    job_execution_id_label = DockerParameter('label', 'scale-job-execution-id={}'.format(job_exe.exe_num))
    job_type_name_label = DockerParameter('label', 'scale-job-type-name={}'.format(job_type.name))
    job_type_version_label = DockerParameter('label', 'scale-job-type-version={}'.format(job_type.version))
    main_label = DockerParameter('label', 'scale-task-type=main')

    if nvidia_docker_label:
        nvidia_runtime_param = DockerParameter('runtime', 'nvidia')
        config.add_to_task('main', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, main_label, nvidia_docker_label,
                                                  nvidia_runtime_param])
    else:
        config.add_to_task('main', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, main_label])

    if not job_type.is_system:
        pre_label = DockerParameter('label', 'scale-task-type=pre')
        post_label = DockerParameter('label', 'scale-task-type=post')
        config.add_to_task('pre', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                 job_execution_id_label, pre_label])
        config.add_to_task('post', docker_params=[job_id_label, job_type_name_label, job_type_version_label,
                                                  job_execution_id_label, post_label])

    # Configure tasks for logging
    if settings.LOGGING_ADDRESS is not None:
        log_driver = DockerParameter('log-driver', 'fluentd')
        fluent_precision = DockerParameter('log-opt', 'fluentd-sub-second-precision=true')
        log_address = DockerParameter('log-opt', 'fluentd-address=%s' % settings.LOGGING_ADDRESS)
        if not job_type.is_system:
            pre_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('pre'),
                                                                              job_type.name, job_type.version,
                                                                              job_exe.job_id, job_exe.exe_num))
            config.add_to_task('pre', docker_params=[log_driver, fluent_precision, log_address, pre_task_tag])
            post_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('post'),
                                                                               job_type.name, job_type.version,
                                                                               job_exe.job_id, job_exe.exe_num))
            config.add_to_task('post', docker_params=[log_driver, fluent_precision, log_address, post_task_tag])
            # TODO: remove es_urls parameter when Scale no longer supports old style job types
            # Post task needs ElasticSearch URL to grab logs for old artifact registration
            es_param = DockerParameter('env', 'ELASTICSEARCH_URL=%s' % settings.ELASTICSEARCH_URL)
            config.add_to_task('post', docker_params=[es_param])
        main_task_tag = DockerParameter('log-opt', 'tag=%s|%s|%s|%s|%s' % (config.get_task_id('main'),
                                                                           job_type.name, job_type.version,
                                                                           job_exe.job_id, job_exe.exe_num))
        config.add_to_task('main', docker_params=[log_driver, fluent_precision, log_address, main_task_tag])
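# For illustration only: for a job that was assigned GPUs "0,1", the main task configured
# by _configure_all_tasks() above ends up with Docker parameters roughly like the ones
# below. The label values are example data, and DockerParameter is the same class used in
# the function above (import omitted because its module path is not shown in this section).
example_gpu_params = [
    DockerParameter('env', 'NVIDIA_VISIBLE_DEVICES=0,1'),            # from get_nvidia_docker_label()
    DockerParameter('runtime', 'nvidia'),                            # run the container with the nvidia runtime
    DockerParameter('label', 'scale-job-id=10'),
    DockerParameter('label', 'scale-job-type-name=example-gpu-job'),
    DockerParameter('label', 'scale-task-type=main'),
]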