Example #1
    def test_find_largest_idle_instance(self, instances, instance_fit_index,
                                        reason):
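        # note: despite its name, this helper keeps only online instances (capacity > 0)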
        def filter_offline_instances(*args):
            return filter(lambda i: i.capacity > 0, instances)

        ig = InstanceGroup(id=10)
        instances_online_only = filter_offline_instances(instances)

        if instance_fit_index is None:
            assert ig.find_largest_idle_instance(
                instances_online_only) is None, reason
        else:
            assert ig.find_largest_idle_instance(
                instances_online_only) == instances[instance_fit_index], reason
Example #2
    def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
        def filter_offline_instances(*args):
            return filter(lambda i: i.capacity > 0, instances)

        with mock.patch.object(InstanceGroup,
                               'instances',
                               Mock(spec_set=['filter'],
                                    filter=lambda *args, **kargs: Mock(spec_set=['order_by'],
                                                                       order_by=filter_offline_instances))):
            ig = InstanceGroup(id=10)

            if instance_fit_index is None:
                assert ig.find_largest_idle_instance() is None, reason
            else:
                assert ig.find_largest_idle_instance() == \
                    instances[instance_fit_index], reason
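
Both test variants above take instances, instance_fit_index, and reason as arguments, which is the signature pytest parametrization produces. The sketch below is illustrative only: the parametrize table and the make_instance / find_largest_idle_instance stand-ins are hypothetical (they are not the project's fixtures or its InstanceGroup method); it simply reads "largest idle instance" as the largest online instance with no running jobs so the pattern can be run on its own.

import pytest
from types import SimpleNamespace


def make_instance(hostname, capacity, jobs_running=0):
    # Hypothetical stand-in for an instance record; the real tests build richer fixtures.
    return SimpleNamespace(hostname=hostname, capacity=capacity, jobs_running=jobs_running)


def find_largest_idle_instance(instances):
    # Simplified stand-in for InstanceGroup.find_largest_idle_instance: among online
    # instances (capacity > 0) that run no jobs, pick the one with the most capacity.
    idle = [i for i in instances if i.capacity > 0 and i.jobs_running == 0]
    return max(idle, key=lambda i: i.capacity, default=None)


@pytest.mark.parametrize(
    "instances,instance_fit_index,reason",
    [
        ([make_instance('node1', 100)], 0, "a single idle instance with capacity should be picked"),
        ([make_instance('node1', 0)], None, "instances without capacity cannot be selected"),
        ([make_instance('node1', 50), make_instance('node2', 100)], 1, "the larger idle instance should win"),
    ],
)
def test_find_largest_idle_instance(instances, instance_fit_index, reason):
    result = find_largest_idle_instance(instances)
    if instance_fit_index is None:
        assert result is None, reason
    else:
        assert result is instances[instance_fit_index], reason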
Example #3
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            if self.is_job_blocked(task):
                logger.debug("{} is blocked from running".format(task.log_format))
                continue
            preferred_instance_groups = task.preferred_instance_groups
            found_acceptable_queue = False
            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue
            for rampart_group in preferred_instance_groups:
                if task.can_run_containerized and rampart_group.is_containerized:
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                remaining_capacity = self.get_remaining_capacity(rampart_group.name)
                if not rampart_group.is_containerized and remaining_capacity <= 0:
                    logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(
                                 rampart_group.name, remaining_capacity))
                    continue

                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances']
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'])

                if execution_instance or rampart_group.is_containerized:
                    if not rampart_group.is_containerized:
                        execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                        execution_instance.jobs_running += 1
                        logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format(
                                     task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity))

                    if execution_instance:
                        # container groups can reach this branch without an execution instance
                        execution_instance = self.real_instances[execution_instance.hostname]
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug("No instance available in group {} to run job {} w/ capacity requirement {}".format(
                                 rampart_group.name, task.log_format, task.task_impact))
            if not found_acceptable_queue:
                logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
Example #4
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue
            preferred_instance_groups = task.preferred_instance_groups

            found_acceptable_queue = False
            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            for rampart_group in preferred_instance_groups:
                if task.capacity_type == 'execution' and rampart_group.is_container_group:
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                    continue

                remaining_capacity = self.get_remaining_capacity(rampart_group.name, capacity_type=task.capacity_type)
                if task.task_impact > 0 and remaining_capacity <= 0:
                    logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(rampart_group.name, remaining_capacity))
                    continue

                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances']
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

                if execution_instance or rampart_group.is_container_group:
                    if not rampart_group.is_container_group:
                        execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                        execution_instance.jobs_running += 1
                        logger.debug(
                            "Starting {} in group {} instance {} (remaining_capacity={})".format(
                                task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity
                            )
                        )

                    if execution_instance:
                        execution_instance = self.real_instances[execution_instance.hostname]
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                            rampart_group.name, task.log_format, task.task_impact
                        )
                    )
            if not found_acceptable_queue:
                task.log_lifecycle("needs_capacity")
                job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        # Many launched jobs are immediately blocked, but most blocks will resolve in a few seconds.
                        # Therefore we should only update the job_explanation after some time has elapsed to
                        # prevent excessive task saves.
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
Example #5
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue

            found_acceptable_queue = False
            preferred_instance_groups = task.preferred_instance_groups

            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            # Determine if there is control capacity for the task
            if task.capacity_type == 'control':
                control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
            else:
                control_impact = settings.AWX_CONTROL_NODE_TASK_IMPACT
            control_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME]['instances'], impact=control_impact, capacity_type='control'
            )
            if not control_instance:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
                logger.debug(f"Skipping task {task.log_format} in pending, not enough capacity left on controlplane to control new tasks")
                continue

            task.controller_node = control_instance.hostname

            # All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
            if task.capacity_type == 'control':
                task.execution_node = control_instance.hostname
                control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - control_impact)
                control_instance.jobs_running += 1
                self.dependency_graph.add_job(task)
                execution_instance = self.real_instances[control_instance.hostname]
                task.log_lifecycle("controller_node_chosen")
                task.log_lifecycle("execution_node_chosen")
                self.start_task(task, self.controlplane_ig, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                continue

            for rampart_group in preferred_instance_groups:
                if rampart_group.is_container_group:
                    control_instance.jobs_running += 1
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                    continue
                # at this point we know the instance group is NOT a container group
                # because if it were, it would have started the task and broken out of the loop.
                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances'], add_hybrid_control_cost=True
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

                if execution_instance:
                    task.execution_node = execution_instance.hostname
                    # If our execution instance is a hybrid, prefer to do control tasks there as well.
                    if execution_instance.node_type == 'hybrid':
                        control_instance = execution_instance
                        task.controller_node = execution_instance.hostname

                    control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - settings.AWX_CONTROL_NODE_TASK_IMPACT)
                    task.log_lifecycle("controller_node_chosen")
                    if control_instance != execution_instance:
                        control_instance.jobs_running += 1
                    execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                    execution_instance.jobs_running += 1
                    task.log_lifecycle("execution_node_chosen")
                    logger.debug(
                        "Starting {} in group {} instance {} (remaining_capacity={})".format(
                            task.log_format, rampart_group.name, execution_instance.hostname, execution_instance.remaining_capacity
                        )
                    )
                    execution_instance = self.real_instances[execution_instance.hostname]
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                            rampart_group.name, task.log_format, task.task_impact
                        )
                    )
            if not found_acceptable_queue:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
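
For the control-capacity branch in Example #5 (the control_impact calculation), a small worked illustration may help. Only the formula mirrors the code above; the function name and the concrete numbers below are hypothetical, and AWX_CONTROL_NODE_TASK_IMPACT is passed in as a plain argument rather than read from settings.

def control_impact_for(task_capacity_type, task_impact, control_node_task_impact):
    # Mirrors the branch at the top of Example #5: 'control' tasks run entirely on the
    # control plane, so the control node absorbs the job's own impact on top of the
    # fixed per-task control cost; other tasks only charge the fixed cost here.
    if task_capacity_type == 'control':
        return task_impact + control_node_task_impact
    return control_node_task_impact


# Hypothetical numbers purely for illustration:
print(control_impact_for('control', 12, 5))    # 17 -> must fit on a control-plane instance
print(control_impact_for('execution', 12, 5))  # 5  -> the job's own 12 is later charged to the execution instance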