コード例 #1
0
ファイル: manager.py プロジェクト: kaydoh/scale
    def _process_queue(self, nodes, job_types, job_type_limits,
                       job_type_resources, workspaces):
        """Walks the front of the queue and schedules new job executions onto the given nodes, subject to cluster
        resources, workspace availability and per-job-type limits

        At most QUEUE_LIMIT queue entries are examined per call. Entries are skipped (left on the queue) when their job
        type requires a resource the cluster does not offer or offers too little of, when the job type or one of the
        workspaces is not known to the scheduler, or when the job type's scheduling limit is used up. Skips are tallied
        per job type name and logged once, after the loop.

        :param nodes: The dict of scheduling nodes stored by node ID for all nodes ready to accept new job executions
        :type nodes: dict
        :param job_types: The dict of job type models stored by job type ID
        :type job_types: dict
        :param job_type_limits: The dict of job type IDs mapping to job type limits; decremented in place for every
            execution that gets scheduled
        :type job_type_limits: dict
        :param job_type_resources: The list of all of the job type resource requirements
        :type job_type_resources: list
        :param workspaces: A dict of all workspaces stored by name
        :type workspaces: dict
        :returns: The list of queued job executions that were scheduled (canceled executions are included)
        :rtype: list
        """

        scheduled = []
        start_time = now()
        # job type name -> {'warning': message, 'count': number of skipped executions}
        type_warnings = {}

        def note_type_warning(type_name, template, fmt_args):
            """Tallies one skipped execution for the job type.

            The message is only formatted the first time a job type is seen. Returns the already existing entry, or
            None when a new entry had to be created.
            """
            entry = type_warnings.get(type_name)
            if entry is None:
                type_warnings[type_name] = {
                    'warning': template % fmt_args,
                    'count': 1
                }
                return None
            entry['count'] += 1
            return entry

        # Nothing can be scheduled without nodes
        if not nodes:
            logger.warning(
                'There are no nodes available. Waiting to schedule until there are free resources...'
            )
            return scheduled

        ignored_type_ids = self._calculate_job_types_to_ignore(
            job_types, job_type_limits)
        cluster_max = resource_mgr.get_max_available_resources()
        queue_head = Queue.objects.get_queue(scheduler_mgr.config.queue_mode,
                                             ignored_type_ids)[:QUEUE_LIMIT]

        for queue in queue_head:
            job_exe = QueuedJobExecution(queue)

            # Canceled job executions are handed back as if they had been scheduled
            if job_exe.is_canceled:
                scheduled.append(job_exe)
                continue

            jt = job_type_mgr.get_job_type(queue.job_type.id)
            # One warning per job type: the name is the INVALID_RESOURCES name followed by the job type name
            warning = SchedulerWarning(
                name=INVALID_RESOURCES.name + jt.name,
                title=INVALID_RESOURCES.title % jt.name,
                description=None)
            if jt.unmet_resources and scheduler_mgr.is_warning_active(warning):
                # This job type was already found to lack resources; do not re-check while its warning is active
                continue

            # Compare every required resource against the maximum the cluster reports as available
            invalid_resources = []
            insufficient_resources = []
            for resource in job_exe.required_resources.resources:
                offered = resource.name in cluster_max._resources
                is_sharedmem = resource.name.lower() == 'sharedmem'

                if offered and not is_sharedmem:
                    # Resource exists; it is insufficient when the requirement exceeds the reported maximum
                    if resource.value > cluster_max._resources[
                            resource.name].value:
                        insufficient_resources.append(resource.name)
                    continue

                # A sharedmem requirement of zero (or less) is ignored
                if is_sharedmem and resource.value <= 0:
                    continue

                existing = note_type_warning(
                    jt.name,
                    '%s job types could not be scheduled as the following resources do not exist in the available cluster resources: %s',
                    (jt.name, resource.name))
                # For an already tallied job type, append the resource name unless the message contains it
                if existing is not None and resource.name not in existing[
                        'warning']:
                    existing['warning'] += (', %s' % resource.name)
                invalid_resources.append(resource.name)

            if invalid_resources:
                scheduler_mgr.warning_active(
                    warning,
                    INVALID_RESOURCES.description % invalid_resources)

            if insufficient_resources:
                scheduler_mgr.warning_active(
                    warning,
                    INSUFFICIENT_RESOURCES.description % insufficient_resources)

            # Invalid names first, then insufficient ones
            unmet = invalid_resources + insufficient_resources
            if unmet:
                jt.unmet_resources = ','.join(unmet)
                jt.save(update_fields=["unmet_resources"])
                continue

            # All requirements can be met: clear the flag and deactivate the warning
            jt.unmet_resources = None
            scheduler_mgr.warning_inactive(warning)
            jt.save(update_fields=["unmet_resources"])

            # The execution's job type must have been synced to the scheduler
            job_type_id = queue.job_type_id
            if job_type_id not in job_types:
                scheduler_mgr.warning_active(
                    UNKNOWN_JOB_TYPE,
                    description=UNKNOWN_JOB_TYPE.description % job_type_id)
                continue

            # ... and so must every input and output workspace
            workspace_names = job_exe.configuration.get_input_workspace_names()
            workspace_names.extend(
                job_exe.configuration.get_output_workspace_names())
            if any(ws_name not in workspaces for ws_name in workspace_names):
                note_type_warning(
                    jt.name,
                    '%s job types could not be scheduled due to missing workspace',
                    jt.name)
                continue

            # Respect the remaining limit for this execution's job type
            is_limited = job_type_id in job_type_limits
            if is_limited and job_type_limits[job_type_id] < 1:
                note_type_warning(
                    jt.name,
                    '%s job types could not be scheduled due to scheduling limit reached',
                    jt.name)
                continue

            # Try to place the execution on a node; on success, use up one unit of the job type limit
            if not self._schedule_new_job_exe(job_exe, nodes,
                                              job_type_resources):
                continue
            scheduled.append(job_exe)
            if job_type_id in job_type_limits:
                job_type_limits[job_type_id] -= 1

        duration = now() - start_time

        # One summary line per job type that had executions skipped
        for entry in type_warnings.values():
            logger.warning('%d %s', entry['count'], entry['warning'])

        # Same message either way; only the log level depends on the threshold
        if duration > PROCESS_QUEUE_WARN_THRESHOLD:
            log_duration = logger.warning
        else:
            log_duration = logger.debug
        log_duration('Processing queue took %.3f seconds',
                     duration.total_seconds())

        return scheduled
コード例 #2
0
ファイル: manager.py プロジェクト: kaydoh/scale
# Maximum number of jobs to grab off of the queue at one time (taken from the Scale settings)
QUEUE_LIMIT = scale_settings.SCHEDULER_QUEUE_LIMIT
# Warning threshold for scheduling query duration (300 ms)
SCHEDULE_QUERY_WARN_THRESHOLD = datetime.timedelta(milliseconds=300)
# Warning threshold for task launch duration (300 ms)
LAUNCH_TASK_WARN_THRESHOLD = datetime.timedelta(milliseconds=300)

# It is considered a resource shortage if a task waits this many generations without being scheduled
TASK_SHORTAGE_WAIT_COUNT = 10

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# Warnings
# The '%s' / '%d' placeholders in the titles and descriptions below are %-format templates: code that raises a
# warning is expected to fill them in before use (e.g. `INVALID_RESOURCES.title % job_type_name`).

# Template for a job type that requires a resource the cluster does not offer: the title takes one '%s' and the
# description takes one '%s'.
INVALID_RESOURCES = SchedulerWarning(
    name='INVALID_RESOURCES',
    title='Invalid Resources for %s',
    description=
    'Cluster does not have one or more of the following resources: %s.')
# Template for a job type that requires more of a resource than any single node has: the title takes one '%s' and
# the description takes one '%s'.
INSUFFICIENT_RESOURCES = SchedulerWarning(
    name='INSUFFICIENT_RESOURCES',
    title='Insufficient Resources for %s',
    description='No node has enough of this resource for the job type: %s.')
# Fixed-text warning (no placeholders) about waiting system tasks holding back new jobs.
WAITING_SYSTEM_TASKS = SchedulerWarning(
    name='WAITING_SYSTEM_TASKS',
    title='Waiting System Tasks',
    description='No new jobs scheduled due to waiting system tasks')
# Warning for a queued job whose job type is unknown; the description's '%d' placeholder requires a number
# (presumably the job type ID -- confirm against callers).
# NOTE(review): the message spells "data base" as two words; left unchanged because it is runtime text.
UNKNOWN_JOB_TYPE = SchedulerWarning(
    name='UNKNOWN_JOB_TYPE',
    title='Unknown Job Type',
    description=
    'A job is queued with a job type %d that is not in the data base')