def stand_up_cluster():
    # FakeObject is a lightweight helper defined in the surrounding test module;
    # it simply stores the keyword arguments it receives as attributes.
    class Instances(FakeObject):
        def add(self, *args):
            for instance in args:
                self.obj.instance_list.append(instance)

        def all(self):
            return self.obj.instance_list

    class InstanceGroup(FakeObject):
        def __init__(self, **kwargs):
            super(InstanceGroup, self).__init__(**kwargs)
            self.instance_list = []

        @property
        def instances(self):
            mgr = Instances(obj=self)
            return mgr

    class Instance(FakeObject):
        pass

    ig_small = InstanceGroup(name='ig_small')
    ig_large = InstanceGroup(name='ig_large')
    tower = InstanceGroup(name='tower')
    i1 = Instance(hostname='i1', capacity=200)
    i2 = Instance(hostname='i2', capacity=200)
    i3 = Instance(hostname='i3', capacity=200)
    ig_small.instances.add(i1)
    ig_large.instances.add(i2, i3)
    tower.instances.add(i2)
    return [tower, ig_large, ig_small]
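The factory returns the three fake groups in the order [tower, ig_large, ig_small], and each group's .instances manager exposes add()/all() over a plain Python list. A minimal usage sketch (the test name and assertions are illustrative, not taken from the project):

def test_cluster_layout():
    tower, ig_large, ig_small = stand_up_cluster()
    assert [i.hostname for i in ig_small.instances.all()] == ['i1']
    assert [i.hostname for i in ig_large.instances.all()] == ['i2', 'i3']
    # 'i2' is deliberately shared between the 'tower' and 'ig_large' groups
    assert tower.instances.all()[0].hostname == 'i2'
    assert sum(i.capacity for i in ig_large.instances.all()) == 400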
Example 2
    def test_fit_task_to_most_remaining_capacity_instance(
            self, task, instances, instance_fit_index, reason):
        ig = InstanceGroup(id=10)

        instance_picked = ig.fit_task_to_most_remaining_capacity_instance(
            task, instances)

        if instance_fit_index is None:
            assert instance_picked is None, reason
        else:
            assert instance_picked == instances[instance_fit_index], reason
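The four arguments are supplied by pytest parametrization: each case provides a task, a list of candidate instances, the index of the instance expected to be picked (or None when nothing fits), and a reason string used as the assertion message. A minimal sketch of such a decorator, using mock objects as illustrative stand-ins rather than the project's actual test data:

from unittest import mock

import pytest


@pytest.mark.parametrize('task,instances,instance_fit_index,reason', [
    (mock.Mock(task_impact=100),
     [mock.Mock(remaining_capacity=200), mock.Mock(remaining_capacity=50)],
     0, "the first instance has the most remaining capacity and fits the task"),
    (mock.Mock(task_impact=500),
     [mock.Mock(remaining_capacity=200), mock.Mock(remaining_capacity=50)],
     None, "no instance has enough remaining capacity for the task"),
])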
Example 3
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = set([wf.unified_job_template_id for wf in self.get_running_workflow_jobs()])
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            if self.is_job_blocked(task):
                logger.debug("{} is blocked from running".format(task.log_format))
                continue
            preferred_instance_groups = task.preferred_instance_groups
            found_acceptable_queue = False
            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue
            for rampart_group in preferred_instance_groups:
                if task.can_run_containerized and rampart_group.is_containerized:
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                remaining_capacity = self.get_remaining_capacity(rampart_group.name)
                if not rampart_group.is_containerized and remaining_capacity <= 0:
                    logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(
                                 rampart_group.name, remaining_capacity))
                    continue

                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(task, self.graph[rampart_group.name]['instances']) or \
                    InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'])

                if execution_instance or rampart_group.is_containerized:
                    if not rampart_group.is_containerized:
                        execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                        execution_instance.jobs_running += 1
                        logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format(
                                     task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity))

                    if execution_instance:
                        execution_instance = self.real_instances[execution_instance.hostname]
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug("No instance available in group {} to run job {} w/ capacity requirement {}".format(
                                 rampart_group.name, task.log_format, task.task_impact))
            if not found_acceptable_queue:
                logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
Example 4
    def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason):
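        # The patch below replaces the InstanceGroup.instances related manager with a
        # stub whose filter(...).order_by(...) chain simply returns the parametrized
        # instance list, so the method under test never touches the database.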
        with mock.patch.object(InstanceGroup,
                               'instances',
                               Mock(spec_set=['filter'],
                                    filter=lambda *args, **kwargs: Mock(spec_set=['order_by'],
                                                                        order_by=lambda x: instances))):
            ig = InstanceGroup(id=10)

            if instance_fit_index is None:
                assert ig.fit_task_to_most_remaining_capacity_instance(task) is None, reason
            else:
                assert ig.fit_task_to_most_remaining_capacity_instance(task) == \
                    instances[instance_fit_index], reason
Example 5
    def test_find_largest_idle_instance(self, instances, instance_fit_index,
                                        reason):
        def filter_offline_instances(*args):
            return filter(lambda i: i.capacity > 0, instances)

        ig = InstanceGroup(id=10)
        instances_online_only = filter_offline_instances(instances)

        if instance_fit_index is None:
            assert ig.find_largest_idle_instance(
                instances_online_only) is None, reason
        else:
            assert ig.find_largest_idle_instance(
                instances_online_only) == instances[instance_fit_index], reason
Example 6
def test_containerized_group_default_fields(instance_group, kube_credential):
    ig = InstanceGroup(name="test_policy_field_defaults")
    ig.policy_instance_list = [1]
    ig.policy_instance_minimum = 5
    ig.policy_instance_percentage = 5
    ig.save()
    assert ig.policy_instance_list == [1]
    assert ig.policy_instance_minimum == 5
    assert ig.policy_instance_percentage == 5
    ig.credential = kube_credential
    ig.save()
    assert ig.policy_instance_list == []
    assert ig.policy_instance_minimum == 0
    assert ig.policy_instance_percentage == 0
Example 7
    def handle(self, **options):
        queuename = options.get('queuename')
        if not queuename:
            raise CommandError("Specify `--queuename` to use this command.")
        changed = False
        with advisory_lock('instance_group_registration_%s' % queuename):
            ig = InstanceGroup.objects.filter(name=queuename)
            control_ig = None
            if options.get('controller'):
                control_ig = InstanceGroup.objects.filter(
                    name=options.get('controller')).first()
            if ig.exists():
                print("Instance Group already registered {}".format(
                    ig[0].name))
                ig = ig[0]
                if control_ig and ig.controller_id != control_ig.pk:
                    ig.controller = control_ig
                    ig.save()
                    print("Set controller group {} on {}.".format(
                        control_ig.name, ig.name))
                    changed = True
            else:
                print("Creating instance group {}".format(queuename))
                ig = InstanceGroup(name=queuename)
                if control_ig:
                    ig.controller = control_ig
                ig.save()
                changed = True
            hostname_list = []
            if options.get('hostnames'):
                hostname_list = options.get('hostnames').split(",")
            instance_list = [x.strip() for x in hostname_list if x]
            for inst_name in instance_list:
                instance = Instance.objects.filter(hostname=inst_name)
                if instance.exists() and instance[0] not in ig.instances.all():
                    ig.instances.add(instance[0])
                    print("Added instance {} to {}".format(
                        instance[0].hostname, ig.name))
                    changed = True
                elif not instance.exists():
                    print("Instance does not exist: {}".format(inst_name))
                    if changed:
                        print('(changed: True)')
                    sys.exit(1)
                else:
                    print("Instance already registered {}".format(
                        instance[0].hostname))
            if changed:
                print('(changed: True)')
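The handler reads three options: queuename (required), an optional controller group name, and a comma-separated hostnames list. Assuming it is exposed as a Django management command (the command name 'register_queue' below is an assumption for illustration), it could be driven programmatically like this:

from django.core.management import call_command

# Register (or update) the 'east' group, attach two instances,
# and make the 'tower' group its controller.
call_command('register_queue',
             queuename='east',
             controller='tower',
             hostnames='node1.example.com,node2.example.com')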
Example 8
    def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
        def filter_offline_instances(*args):
            return filter(lambda i: i.capacity > 0, instances)

        with mock.patch.object(InstanceGroup,
                               'instances',
                               Mock(spec_set=['filter'],
                                    filter=lambda *args, **kwargs: Mock(spec_set=['order_by'],
                                                                        order_by=filter_offline_instances))):
            ig = InstanceGroup(id=10)

            if instance_fit_index is None:
                assert ig.find_largest_idle_instance() is None, reason
            else:
                assert ig.find_largest_idle_instance() == \
                    instances[instance_fit_index], reason
Example 9
    def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason):
        InstanceGroup(id=10)
        tm_igs = TaskManagerInstanceGroups(instance_groups={'controlplane': {'instances': instances}})

        instance_picked = tm_igs.fit_task_to_most_remaining_capacity_instance(task, 'controlplane')

        if instance_fit_index is None:
            assert instance_picked is None, reason
        else:
            assert instance_picked == instances[instance_fit_index], reason
Example 10
    def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
        def filter_offline_instances(*args):
            return filter(lambda i: i.capacity > 0, instances)

        InstanceGroup(id=10)
        instances_online_only = filter_offline_instances(instances)
        tm_igs = TaskManagerInstanceGroups(instance_groups={'controlplane': {'instances': instances_online_only}})

        if instance_fit_index is None:
            assert tm_igs.find_largest_idle_instance('controlplane') is None, reason
        else:
            assert tm_igs.find_largest_idle_instance('controlplane') == instances[instance_fit_index], reason
Example 11
def tower_instance_group():
    ig = InstanceGroup(name='tower')
    ig.save()
    return ig
Example 12
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = set([wf.unified_job_template_id for wf in self.get_running_workflow_jobs()])
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue
            preferred_instance_groups = task.preferred_instance_groups

            found_acceptable_queue = False
            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            for rampart_group in preferred_instance_groups:
                if task.capacity_type == 'execution' and rampart_group.is_container_group:
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                    continue

                remaining_capacity = self.get_remaining_capacity(rampart_group.name, capacity_type=task.capacity_type)
                if task.task_impact > 0 and remaining_capacity <= 0:
                    logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(rampart_group.name, remaining_capacity))
                    continue

                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances']
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

                if execution_instance or rampart_group.is_container_group:
                    if not rampart_group.is_container_group:
                        execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                        execution_instance.jobs_running += 1
                        logger.debug(
                            "Starting {} in group {} instance {} (remaining_capacity={})".format(
                                task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity
                            )
                        )

                    if execution_instance:
                        execution_instance = self.real_instances[execution_instance.hostname]
                    self.graph[rampart_group.name]['graph'].add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                            rampart_group.name, task.log_format, task.task_impact
                        )
                    )
            if not found_acceptable_queue:
                task.log_lifecycle("needs_capacity")
                job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        # Many launched jobs are immediately blocked, but most blocks will resolve in a few seconds.
                        # Therefore we should only update the job_explanation after some time has elapsed to
                        # prevent excessive task saves.
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
Example 13
def containerized_instance_group(instance_group, kube_credential):
    ig = InstanceGroup(name="container")
    ig.credential = kube_credential
    ig.is_container_group = True
    ig.save()
    return ig
Example 14
    def process_pending_tasks(self, pending_tasks):
        running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
        tasks_to_update_job_explanation = []
        for task in pending_tasks:
            if self.start_task_limit <= 0:
                break
            blocked_by = self.job_blocked_by(task)
            if blocked_by:
                task.log_lifecycle("blocked", blocked_by=blocked_by)
                job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
                if task.job_explanation != job_explanation:
                    if task.created < (tz_now() - self.time_delta_job_explanation):
                        task.job_explanation = job_explanation
                        tasks_to_update_job_explanation.append(task)
                continue

            found_acceptable_queue = False
            preferred_instance_groups = task.preferred_instance_groups

            if isinstance(task, WorkflowJob):
                if task.unified_job_template_id in running_workflow_templates:
                    if not task.allow_simultaneous:
                        logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                        continue
                else:
                    running_workflow_templates.add(task.unified_job_template_id)
                self.start_task(task, None, task.get_jobs_fail_chain(), None)
                continue

            # Determine if there is control capacity for the task
            if task.capacity_type == 'control':
                control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
            else:
                control_impact = settings.AWX_CONTROL_NODE_TASK_IMPACT
            control_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME]['instances'], impact=control_impact, capacity_type='control'
            )
            if not control_instance:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
                logger.debug(f"Skipping task {task.log_format} in pending, not enough capacity left on controlplane to control new tasks")
                continue

            task.controller_node = control_instance.hostname

            # All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
            if task.capacity_type == 'control':
                task.execution_node = control_instance.hostname
                control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - control_impact)
                control_instance.jobs_running += 1
                self.dependency_graph.add_job(task)
                execution_instance = self.real_instances[control_instance.hostname]
                task.log_lifecycle("controller_node_chosen")
                task.log_lifecycle("execution_node_chosen")
                self.start_task(task, self.controlplane_ig, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                continue

            for rampart_group in preferred_instance_groups:
                if rampart_group.is_container_group:
                    control_instance.jobs_running += 1
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                    found_acceptable_queue = True
                    break

                # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
                if settings.IS_K8S and task.capacity_type == 'execution':
                    logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                    continue
                # at this point we know the instance group is NOT a container group
                # because if it was, it would have started the task and broke out of the loop.
                execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                    task, self.graph[rampart_group.name]['instances'], add_hybrid_control_cost=True
                ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

                if execution_instance:
                    task.execution_node = execution_instance.hostname
                    # If our execution instance is a hybrid, prefer to do control tasks there as well.
                    if execution_instance.node_type == 'hybrid':
                        control_instance = execution_instance
                        task.controller_node = execution_instance.hostname

                    control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - settings.AWX_CONTROL_NODE_TASK_IMPACT)
                    task.log_lifecycle("controller_node_chosen")
                    if control_instance != execution_instance:
                        control_instance.jobs_running += 1
                    execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                    execution_instance.jobs_running += 1
                    task.log_lifecycle("execution_node_chosen")
                    logger.debug(
                        "Starting {} in group {} instance {} (remaining_capacity={})".format(
                            task.log_format, rampart_group.name, execution_instance.hostname, execution_instance.remaining_capacity
                        )
                    )
                    execution_instance = self.real_instances[execution_instance.hostname]
                    self.dependency_graph.add_job(task)
                    self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                    found_acceptable_queue = True
                    break
                else:
                    logger.debug(
                        "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                            rampart_group.name, task.log_format, task.task_impact
                        )
                    )
            if not found_acceptable_queue:
                self.task_needs_capacity(task, tasks_to_update_job_explanation)
        UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
Example 15
def isolated_instance_group(instance_group):
    ig = InstanceGroup(name="iso", controller=instance_group)
    ig.save()
    return ig
Example 16
def instance_group(job_factory):
    ig = InstanceGroup(name="east")
    ig.save()
    return ig
Example 17
def isolated_instance_group(instance_group, instance):
    ig = InstanceGroup(name="iso", controller=instance_group)
    ig.save()
    ig.instances.set([instance])
    ig.save()
    return ig
Example 18
def container_group():
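    # Passing a real InstanceGroup instance as the first positional argument makes it
    # the Mock's spec, so only attributes that exist on the real object can be accessed.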
    instance_group = mock.Mock(InstanceGroup(name='container-group'))

    return instance_group