Example #1
    def test_calls_where_node_has_no_gpus(self):
        node_id = 7
        job_id = 10
        gpu_count = 2
        required_gpus = 2
        GPUManager.define_node_gpus(node_id, gpu_count)

        # Node 8 has no GPUs defined, so every call for it should fail
        node_id = 8
        self.assertFalse(
            GPUManager.reserve_gpus_for_job(node_id, required_gpus))
        self.assertFalse(
            GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
        nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
        self.assertEqual(nvidia_label, "")
Example #2
    def accept_new_job_exe(self, job_exe):
        """Asks the node if it can accept the given new job execution

        :param job_exe: The new job execution
        :type job_exe: :class:`queue.job_exe.QueuedJobExecution`
        :returns: True if the new job execution was accepted, False otherwise
        :rtype: bool
        """

        if not self.is_ready_for_new_job:
            return False

        resources = job_exe.required_resources
        if self._remaining_resources.is_sufficient_to_meet(resources):

            if resources.gpus > 0:
                if not GPUManager.reserve_gpus_for_job(self.node_id, int(resources.gpus)):
                    return False
                    
            self._allocated_queued_job_exes.append(job_exe)
            self.allocated_resources.add(resources)
            self._remaining_resources.subtract(resources)
            job_exe.scheduled(self.agent_id, self.node_id, resources)

            return True

        return False
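The GPU branch in accept_new_job_exe can only succeed if the node's GPUs were previously registered with GPUManager (Example #3 shows where that happens). Below is a minimal sketch of that precondition, using only the GPUManager calls that appear in these examples; the import path and the node/GPU numbers are assumptions.

from node.resources.gpu_manager import GPUManager  # assumed import path

node_id = 1        # hypothetical node ID
required_gpus = 2  # hypothetical requirement

# Reserving on a node with no defined GPUs fails (compare Example #1)...
assert not GPUManager.reserve_gpus_for_job(node_id, required_gpus)

# ...and succeeds once the node's GPU count has been registered.
GPUManager.define_node_gpus(node_id, 4)
assert GPUManager.reserve_gpus_for_job(node_id, required_gpus)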
Example #3
    def __init__(self, agent_id, node, tasks, running_job_exes, resource_set):
        """Constructor

        :param agent_id: The agent ID
        :type agent_id: string
        :param node: The node
        :type node: :class:`scheduler.node.node_class.Node`
        :param tasks: The current tasks running on the node
        :type tasks: :func:`list`
        :param running_job_exes: The current job executions running on the node
        :type running_job_exes: :func:`list`
        :param resource_set: The set of resources for the node
        :type resource_set: :class:`scheduler.resources.agent.ResourceSet`
        """

        self.agent_id = agent_id  # Set agent ID separately from node since it can change during scheduling
        self.hostname = node.hostname
        self.node_id = node.id
        self.is_ready_for_new_job = node.is_ready_for_new_job()  # Cache this for consistency
        self.is_ready_for_next_job_task = node.is_ready_for_next_job_task()  # Cache this for consistency
        self.is_ready_for_system_task = node.is_ready_for_system_task()  # Cache this for consistency
        self.allocated_offers = []
        self.allocated_resources = NodeResources()
        self.allocated_tasks = []  # Tasks that have been allocated resources from this node

        self._node = node
        self._allocated_queued_job_exes = []  # New queued job executions that have been allocated resources
        self._allocated_running_job_exes = []  # Running job executions that have been allocated resources
        self._running_job_exes = running_job_exes
        self._running_tasks = tasks

        self._offered_resources = NodeResources()  # The amount of resources that were originally offered
        self._offered_resources.add(resource_set.offered_resources)
        self._remaining_resources = NodeResources()
        self._remaining_resources.add(self._offered_resources)
        self._task_resources = resource_set.task_resources
        self._watermark_resources = resource_set.watermark_resources
        if int(resource_set.offered_resources.gpus) > 0:
            GPUManager.define_node_gpus(self.node_id, int(resource_set.offered_resources.gpus))
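The constructor above registers a node's offered GPUs with GPUManager. The behavior of repeated calls to define_node_gpus is pinned down by the tests in Examples #4 and #10: a larger count grows the node's pool, while a smaller count is ignored. A short sketch of that behavior follows (assumed import path, hypothetical node ID).

from node.resources.gpu_manager import GPUManager  # assumed import path

node_id = 42  # hypothetical node ID

GPUManager.define_node_gpus(node_id, 2)
assert GPUManager.get_gpu_count_for_node(node_id) == 2

GPUManager.define_node_gpus(node_id, 4)  # a larger offer adds GPUs
assert GPUManager.get_gpu_count_for_node(node_id) == 4

GPUManager.define_node_gpus(node_id, 1)  # a smaller offer is ignored
assert GPUManager.get_gpu_count_for_node(node_id) == 4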
Example #4
    def test_add_less_gpu(self):
        node_id = 2
        gpu_count = 3
        GPUManager.define_node_gpus(node_id, gpu_count)
        self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 3)

        gpu_count = 1
        GPUManager.define_node_gpus(node_id, gpu_count)
        # Redefining a node with fewer GPUs should not reduce its GPU count
        self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 3)
Example #5
    def test_reserve_gpu(self):
        node_id = 4
        gpu_count = 2
        required_gpus = 2
        GPUManager.define_node_gpus(node_id, gpu_count)
        self.assertTrue(GPUManager.reserve_gpus_for_job(
            node_id, required_gpus))

        job_id = 11
        self.assertFalse(
            GPUManager.reserve_gpus_for_job(node_id, required_gpus))

        gpu_count = 4
        GPUManager.define_node_gpus(node_id, gpu_count)
        self.assertTrue(GPUManager.reserve_gpus_for_job(
            node_id, required_gpus))
Example #6
    def create_job_exe_model(self, framework_id, when):
        """Creates and returns a scheduled job execution model

        :param framework_id: The scheduling framework ID
        :type framework_id: string
        :param when: The start time
        :type when: :class:`datetime.datetime`
        :returns: The job execution model
        :rtype: :class:`job.models.JobExecution`
        """

        job_exe = JobExecution()
        job_exe.job_id = self._queue.job_id
        job_exe.job_type_id = self._queue.job_type_id
        job_exe.recipe_id = self._queue.recipe_id
        job_exe.batch_id = self._queue.batch_id
        job_exe.exe_num = self._queue.exe_num
        job_exe.timeout = self._queue.timeout
        job_exe.docker_image = self._queue.docker_image
        job_exe.input_file_size = self._queue.input_file_size
        job_exe.configuration = self.configuration.get_dict()
        job_exe.queued = self._queue.queued

        if self.is_canceled:
            job_exe.node_id = None
            job_exe.resources = NodeResources().get_json().get_dict()
            job_exe.started = None
        else:
            job_exe.node_id = self._scheduled_node_id
            job_exe.resources = self._scheduled_resources.get_json().get_dict()
            job_exe.started = when

        job_exe.set_cluster_id(framework_id, self._queue.job_id, self._queue.exe_num)

        if self.required_resources.gpus > 0:
            if not GPUManager.assign_gpus_for_job(job_exe.node_id, job_exe.job_id, self.required_resources.gpus):
                logger.error('Job %s was unable to assign %s reserved GPUs on node %s. This should not be possible; '
                             'something has gone wrong.', job_exe.job_id, self.required_resources.gpus,
                             job_exe.node_id)

        return job_exe
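create_job_exe_model converts the reservation made at scheduling time (Example #2) into a concrete per-job assignment, which is what get_nvidia_docker_label later reads (Example #15). The reserve, assign, label sequence below mirrors the tests in Examples #10 and #12, with an assumed import path and hypothetical IDs.

from node.resources.gpu_manager import GPUManager  # assumed import path

node_id, job_id, gpus = 3, 99, 2  # hypothetical IDs

GPUManager.define_node_gpus(node_id, gpus)
GPUManager.reserve_gpus_for_job(node_id, gpus)         # done while scheduling (Example #2)
GPUManager.assign_gpus_for_job(node_id, job_id, gpus)  # done when the model is created

# The assignment pins concrete device indices for the job.
assert GPUManager.get_nvidia_docker_label(node_id, job_id) == '0,1'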
Example #7
    def update(self, status):
        """
        Invoked when the status of a task has changed (e.g., a slave is lost
        and so the task is lost, a task finishes and an executor sends a
        status update saying so, etc.) Note that returning from this callback
        acknowledges receipt of this status update.  If for whatever reason
        the scheduler aborts during this callback (or the process exits)
        another status update will be delivered.  Note, however, that this is
        currently not true if the slave sending the status update is lost or
        fails during that time.
        """

        started = now()

        model = utils.create_task_update_model(status)
        mesos_status = model.status
        task_update = TaskStatusUpdate(model, utils.get_status_agent_id(status), utils.get_status_data(status))
        task_id = task_update.task_id
        was_task_finished = task_update.status in TaskStatusUpdate.TERMINAL_STATUSES
        was_job_finished = False

        if mesos_status == 'TASK_ERROR':
            logger.error('Status update for task %s: %s', task_id, mesos_status)
        elif mesos_status == 'TASK_LOST':
            logger.warning('Status update for task %s: %s', task_id, mesos_status)
        else:
            logger.info('Status update for task %s: %s', task_id, mesos_status)

        # Since we have a status update for this task, remove it from reconciliation set
        recon_mgr.remove_task_id(task_id)

        # Hand off task update to be saved in the database
        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Grab job execution ID from manager
            cluster_id = JobExecution.parse_cluster_id(task_id)
            job_exe = job_exe_mgr.get_running_job_exe(cluster_id)
            if job_exe:
                model.job_exe_id = job_exe.id
        task_update_mgr.add_task_update(model)

        # Update task with latest status
        # This should happen before the job execution or node manager are updated, since they will assume that the task
        # has already been updated
        task_mgr.handle_task_update(task_update)

        if task_id.startswith(JOB_TASK_ID_PREFIX):
            # Job task, so update the job execution
            try:
                job_exe = job_exe_mgr.handle_task_update(task_update)
                if job_exe and job_exe.is_finished():
                    logger.info("job_exe with job id %s and node id %s is finished", job_exe.job_id, job_exe.node_id)
                    was_job_finished = True
                    cleanup_mgr.add_job_execution(job_exe)
                    GPUManager.release_gpus(job_exe.node_id, job_exe.job_id)

            except Exception:
                cluster_id = JobExecution.parse_cluster_id(task_id)
                logger.exception('Error handling status update for job execution: %s', cluster_id)
                # Error handling status update, add task so it can be reconciled
                task = task_mgr.get_task(task_id)
                if task:
                    recon_mgr.add_tasks([task])
        else:
            # Not a job task, so must be either a node or system task
            node_mgr.handle_task_update(task_update)
            system_task_mgr.handle_task_update(task_update)

        scheduler_mgr.add_task_update_counts(was_task_finished, was_job_finished)

        duration = now() - started
        msg = 'Scheduler statusUpdate() took %.3f seconds'
        if duration > ScaleScheduler.NORMAL_WARN_THRESHOLD:
            logger.warning(msg, duration.total_seconds())
        else:
            logger.debug(msg, duration.total_seconds())
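When a job execution reaches a terminal state, update() hands its GPUs back via release_gpus, after which the node can satisfy new reservations; this is exactly what Example #9 asserts. A condensed sketch with an assumed import path and hypothetical IDs:

from node.resources.gpu_manager import GPUManager  # assumed import path

node_id, finished_job_id = 5, 10  # hypothetical IDs

GPUManager.define_node_gpus(node_id, 2)
GPUManager.reserve_gpus_for_job(node_id, 2)
GPUManager.assign_gpus_for_job(node_id, finished_job_id, 2)
assert not GPUManager.reserve_gpus_for_job(node_id, 2)  # node is fully assigned

GPUManager.release_gpus(node_id, finished_job_id)       # what update() does on finish
assert GPUManager.reserve_gpus_for_job(node_id, 2)      # capacity is available again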
Example #8
    def setUp(self):
        django.setup()
        GPUManager.reset_gpu_dict()
        self.agent_id = 'agent_1'
Example #9
    def test_release_gpu(self):
        node_id = 7
        job_id = 10
        gpu_count = 2
        required_gpus = 2
        GPUManager.define_node_gpus(node_id, gpu_count)
        GPUManager.reserve_gpus_for_job(node_id, required_gpus)
        self.assertTrue(
            GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))

        job_id = 11

        self.assertFalse(
            GPUManager.reserve_gpus_for_job(
                node_id, required_gpus))  # should not have enough GPUs

        GPUManager.release_gpus(node_id, 10)  # release the GPUs held by job 10
        self.assertTrue(GPUManager.reserve_gpus_for_job(
            node_id, required_gpus))  # GPUs should be available again
        self.assertTrue(
            GPUManager.assign_gpus_for_job(
                node_id, job_id, required_gpus))  # GPUs can now be assigned to job 11
        nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
        self.assertEqual(nvidia_label, "0,1")
Example #10
    def test_get_nvidia_label(self):
        node_id = 6
        job_id = 10
        gpu_count = 2
        required_gpus = 2
        GPUManager.define_node_gpus(node_id, gpu_count)
        GPUManager.reserve_gpus_for_job(node_id, required_gpus)
        GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
        nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
        self.assertEqual(nvidia_label, "0,1")

        gpu_count = 4
        job_id = 11
        GPUManager.define_node_gpus(node_id, gpu_count)
        GPUManager.reserve_gpus_for_job(node_id, required_gpus)
        GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus)
        nvidia_label = GPUManager.get_nvidia_docker_label(node_id, job_id)
        self.assertEqual(nvidia_label, "2,3")
Example #11
    def setUp(self):
        GPUManager.reset_gpu_dict()
Example #12
    def test_assign_gpus(self):
        node_id = 5
        job_id = 10
        gpu_count = 2
        required_gpus = 2
        GPUManager.define_node_gpus(node_id, gpu_count)
        GPUManager.reserve_gpus_for_job(node_id, required_gpus)
        self.assertTrue(
            GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))

        job_id = 11
        self.assertFalse(
            GPUManager.reserve_gpus_for_job(
                node_id, required_gpus))  # should not have enough GPUs

        gpu_count = 4
        GPUManager.define_node_gpus(node_id, gpu_count)
        GPUManager.reserve_gpus_for_job(node_id, required_gpus)
        self.assertTrue(
            GPUManager.assign_gpus_for_job(node_id, job_id, required_gpus))
Example #13
    def test_add_additional_GPU(self):
        node_id = 3
        gpu_count = 4
        GPUManager.define_node_gpus(node_id, gpu_count)
        self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), 4)
Example #14
    def test_add_new_node_gpus(self):
        node_id = 1
        gpu_count = 3
        GPUManager.define_node_gpus(node_id, gpu_count)
        self.assertEqual(GPUManager.get_gpu_count_for_node(node_id), gpu_count)
Example #15
    def _configure_all_tasks(self, config, job_exe, job_type):
        """Configures the given execution with items that apply to all tasks

        :param config: The execution configuration
        :type config: :class:`job.execution.configuration.json.exe_config.ExecutionConfiguration`
        :param job_exe: The job execution model being scheduled
        :type job_exe: :class:`job.models.JobExecution`
        :param job_type: The job type model
        :type job_type: :class:`job.models.JobType`
        """

        config.set_task_ids(job_exe.get_cluster_id())

        for task_type in config.get_task_types():
            # Configure env vars describing allocated task resources
            env_vars = {}
            nvidia_docker_label = None

            for resource in config.get_resources(task_type).resources:
                env_name = 'ALLOCATED_%s' % normalize_env_var_name(resource.name)
                env_vars[env_name] = '%.1f' % resource.value  # Assumes scalar resources
                if resource.name == "gpus" and int(resource.value) > 0:
                    gpu_list = GPUManager.get_nvidia_docker_label(job_exe.node_id, job_exe.job_id)
                    nvidia_docker_label = DockerParameter(
                        'env', 'NVIDIA_VISIBLE_DEVICES={}'.format(gpu_list.strip(',')))

            # Configure env vars for Scale meta-data
            env_vars['SCALE_JOB_ID'] = unicode(job_exe.job_id)
            env_vars['SCALE_EXE_NUM'] = unicode(job_exe.exe_num)
            if job_exe.recipe_id:
                env_vars['SCALE_RECIPE_ID'] = unicode(job_exe.recipe_id)
            if job_exe.batch_id:
                env_vars['SCALE_BATCH_ID'] = unicode(job_exe.batch_id)

            # Configure workspace volumes
            workspace_volumes = {}
            for task_workspace in config.get_workspaces(task_type):
                logger.debug(self._workspaces)
                workspace_model = self._workspaces[task_workspace.name]
                # TODO: Should refactor workspace broker to return a Volume object and remove BrokerVolume
                if workspace_model.volume:
                    vol_name = get_workspace_volume_name(
                        job_exe, task_workspace.name)
                    cont_path = get_workspace_volume_path(workspace_model.name)
                    if workspace_model.volume.host:
                        host_path = workspace_model.volume.remote_path
                        volume = Volume(vol_name,
                                        cont_path,
                                        task_workspace.mode,
                                        is_host=True,
                                        host_path=host_path)
                    else:
                        driver = workspace_model.volume.driver
                        driver_opts = {}
                        # TODO: Hack alert for nfs broker, as stated above, we should return Volume from broker
                        if driver == 'nfs':
                            driver_opts = {
                                'share': workspace_model.volume.remote_path
                            }
                        volume = Volume(vol_name,
                                        cont_path,
                                        task_workspace.mode,
                                        is_host=False,
                                        driver=driver,
                                        driver_opts=driver_opts)
                    workspace_volumes[task_workspace.name] = volume

            config.add_to_task(task_type,
                               env_vars=env_vars,
                               wksp_volumes=workspace_volumes)

        # Labels for metric grouping
        job_id_label = DockerParameter(
            'label', 'scale-job-id={}'.format(job_exe.job_id))
        job_execution_id_label = DockerParameter(
            'label', 'scale-job-execution-id={}'.format(job_exe.exe_num))
        job_type_name_label = DockerParameter(
            'label', 'scale-job-type-name={}'.format(job_type.name))
        job_type_version_label = DockerParameter(
            'label', 'scale-job-type-version={}'.format(job_type.version))
        main_label = DockerParameter('label', 'scale-task-type=main')
        if nvidia_docker_label:
            nvidia_runtime_param = DockerParameter('runtime', 'nvidia')
            config.add_to_task('main',
                               docker_params=[
                                   job_id_label, job_type_name_label,
                                   job_type_version_label,
                                   job_execution_id_label, main_label,
                                   nvidia_docker_label, nvidia_runtime_param
                               ])
        else:
            config.add_to_task('main',
                               docker_params=[
                                   job_id_label, job_type_name_label,
                                   job_type_version_label,
                                   job_execution_id_label, main_label
                               ])

        if not job_type.is_system:
            pre_label = DockerParameter('label', 'scale-task-type=pre')
            post_label = DockerParameter('label', 'scale-task-type=post')
            config.add_to_task('pre',
                               docker_params=[
                                   job_id_label, job_type_name_label,
                                   job_type_version_label,
                                   job_execution_id_label, pre_label
                               ])
            config.add_to_task('post',
                               docker_params=[
                                   job_id_label, job_type_name_label,
                                   job_type_version_label,
                                   job_execution_id_label, post_label
                               ])

        # Configure tasks for logging
        if settings.LOGGING_ADDRESS is not None:
            log_driver = DockerParameter('log-driver', 'fluentd')
            fluent_precision = DockerParameter(
                'log-opt', 'fluentd-sub-second-precision=true')
            log_address = DockerParameter(
                'log-opt', 'fluentd-address=%s' % settings.LOGGING_ADDRESS)
            if not job_type.is_system:
                pre_task_tag = DockerParameter(
                    'log-opt', 'tag=%s|%s|%s|%s|%s' %
                    (config.get_task_id('pre'), job_type.name,
                     job_type.version, job_exe.job_id, job_exe.exe_num))
                config.add_to_task('pre',
                                   docker_params=[
                                       log_driver, fluent_precision,
                                       log_address, pre_task_tag
                                   ])
                post_task_tag = DockerParameter(
                    'log-opt', 'tag=%s|%s|%s|%s|%s' %
                    (config.get_task_id('post'), job_type.name,
                     job_type.version, job_exe.job_id, job_exe.exe_num))
                config.add_to_task('post',
                                   docker_params=[
                                       log_driver, fluent_precision,
                                       log_address, post_task_tag
                                   ])
                # TODO: remove es_urls parameter when Scale no longer supports old style job types

                # Post task needs ElasticSearch URL to grab logs for old artifact registration
                es_param = DockerParameter(
                    'env', 'ELASTICSEARCH_URL=%s' % settings.ELASTICSEARCH_URL)
                config.add_to_task('post', docker_params=[es_param])
            main_task_tag = DockerParameter(
                'log-opt', 'tag=%s|%s|%s|%s|%s' %
                (config.get_task_id('main'), job_type.name, job_type.version,
                 job_exe.job_id, job_exe.exe_num))
            config.add_to_task('main',
                               docker_params=[
                                   log_driver, fluent_precision, log_address,
                                   main_task_tag
                               ])
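The label returned by get_nvidia_docker_label is a comma-separated list of device indices ("0,1" in the tests above); _configure_all_tasks turns it into an NVIDIA_VISIBLE_DEVICES environment parameter paired with runtime=nvidia on the main task. A sketch of just that string handling, with an assumed import path and hypothetical IDs that already have GPUs assigned (as in Example #10):

from node.resources.gpu_manager import GPUManager  # assumed import path

node_id, job_id = 6, 10  # hypothetical IDs with GPUs already assigned

gpu_list = GPUManager.get_nvidia_docker_label(node_id, job_id)   # e.g. '0,1'
env_value = 'NVIDIA_VISIBLE_DEVICES={}'.format(gpu_list.strip(','))
# _configure_all_tasks wraps this value in DockerParameter('env', ...) and adds
# DockerParameter('runtime', 'nvidia') to the 'main' task's docker_params.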