def stand_up_cluster():

    class Instances(FakeObject):
        def add(self, *args):
            for instance in args:
                self.obj.instance_list.append(instance)

        def all(self):
            return self.obj.instance_list

    class InstanceGroup(FakeObject):
        def __init__(self, **kwargs):
            super(InstanceGroup, self).__init__(**kwargs)
            self.instance_list = []

        @property
        def instances(self):
            mgr = Instances(obj=self)
            return mgr

    class Instance(FakeObject):
        pass

    ig_small = InstanceGroup(name='ig_small')
    ig_large = InstanceGroup(name='ig_large')
    tower = InstanceGroup(name='tower')
    i1 = Instance(hostname='i1', capacity=200)
    i2 = Instance(hostname='i2', capacity=200)
    i3 = Instance(hostname='i3', capacity=200)

    ig_small.instances.add(i1)
    ig_large.instances.add(i2, i3)
    tower.instances.add(i2)
    return [tower, ig_large, ig_small]
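# FakeObject itself is not shown in this excerpt. A minimal sketch of what the
# fixture above appears to assume (a hypothetical attribute bag that stores
# arbitrary keyword arguments, not necessarily the real test helper verbatim):
class FakeObject(object):
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)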
def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason):
    ig = InstanceGroup(id=10)

    instance_picked = ig.fit_task_to_most_remaining_capacity_instance(task, instances)

    if instance_fit_index is None:
        assert instance_picked is None, reason
    else:
        assert instance_picked == instances[instance_fit_index], reason
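# For context, a plausible sketch of the selection logic the test above
# exercises, inferred only from the test's expectations (an assumption, not
# the actual InstanceGroup implementation): pick the instance with the most
# remaining capacity that can still absorb the task's impact, else None.
def fit_task_to_most_remaining_capacity_instance_sketch(task, instances):
    best = None
    for i in instances:
        # Skip instances that cannot fit the task at all.
        if i.remaining_capacity < task.task_impact:
            continue
        if best is None or i.remaining_capacity > best.remaining_capacity:
            best = i
    return best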
def process_pending_tasks(self, pending_tasks):
    running_workflow_templates = set([wf.unified_job_template_id for wf in self.get_running_workflow_jobs()])
    for task in pending_tasks:
        if self.start_task_limit <= 0:
            break
        if self.is_job_blocked(task):
            logger.debug("{} is blocked from running".format(task.log_format))
            continue
        preferred_instance_groups = task.preferred_instance_groups
        found_acceptable_queue = False
        if isinstance(task, WorkflowJob):
            if task.unified_job_template_id in running_workflow_templates:
                if not task.allow_simultaneous:
                    logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                    continue
            else:
                running_workflow_templates.add(task.unified_job_template_id)
            self.start_task(task, None, task.get_jobs_fail_chain(), None)
            continue
        for rampart_group in preferred_instance_groups:
            if task.can_run_containerized and rampart_group.is_containerized:
                self.graph[rampart_group.name]['graph'].add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                found_acceptable_queue = True
                break

            remaining_capacity = self.get_remaining_capacity(rampart_group.name)
            if not rampart_group.is_containerized and remaining_capacity <= 0:
                logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(rampart_group.name, remaining_capacity))
                continue

            execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[rampart_group.name]['instances']
            ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'])

            if execution_instance or rampart_group.is_containerized:
                if not rampart_group.is_containerized:
                    execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                    execution_instance.jobs_running += 1
                    logger.debug("Starting {} in group {} instance {} (remaining_capacity={})".format(
                        task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity))

                # Guard against container groups, where execution_instance may be None;
                # otherwise swap the graph copy for the real Instance object.
                if execution_instance:
                    execution_instance = self.real_instances[execution_instance.hostname]
                self.graph[rampart_group.name]['graph'].add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                break
            else:
                logger.debug("No instance available in group {} to run job {} w/ capacity requirement {}".format(
                    rampart_group.name, task.log_format, task.task_impact))
        if not found_acceptable_queue:
            logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason):
    with mock.patch.object(InstanceGroup, 'instances',
                           Mock(spec_set=['filter'],
                                filter=lambda *args, **kwargs: Mock(spec_set=['order_by'],
                                                                    order_by=lambda x: instances))):
        ig = InstanceGroup(id=10)

        if instance_fit_index is None:
            assert ig.fit_task_to_most_remaining_capacity_instance(task) is None, reason
        else:
            assert ig.fit_task_to_most_remaining_capacity_instance(task) == instances[instance_fit_index], reason
def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
    def filter_offline_instances(*args):
        # Keep only online instances; capacity 0 is treated as offline here.
        return filter(lambda i: i.capacity > 0, instances)

    ig = InstanceGroup(id=10)
    instances_online_only = filter_offline_instances(instances)

    if instance_fit_index is None:
        assert ig.find_largest_idle_instance(instances_online_only) is None, reason
    else:
        assert ig.find_largest_idle_instance(instances_online_only) == instances[instance_fit_index], reason
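# Again for context, a plausible sketch of the fallback selection the test
# above exercises, inferred from the test alone (an assumption, not the real
# InstanceGroup implementation): among instances running no jobs, pick the
# one with the largest capacity.
def find_largest_idle_instance_sketch(instances):
    largest = None
    for i in instances:
        if i.jobs_running == 0 and (largest is None or i.capacity > largest.capacity):
            largest = i
    return largest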
def test_containerized_group_default_fields(instance_group, kube_credential):
    ig = InstanceGroup(name="test_policy_field_defaults")
    ig.policy_instance_list = [1]
    ig.policy_instance_minimum = 5
    ig.policy_instance_percentage = 5
    ig.save()
    assert ig.policy_instance_list == [1]
    assert ig.policy_instance_minimum == 5
    assert ig.policy_instance_percentage == 5

    # Attaching a Kubernetes credential makes the group containerized,
    # which resets the policy fields back to their defaults on save.
    ig.credential = kube_credential
    ig.save()
    assert ig.policy_instance_list == []
    assert ig.policy_instance_minimum == 0
    assert ig.policy_instance_percentage == 0
def handle(self, **options):
    queuename = options.get('queuename')
    if not queuename:
        raise CommandError("Specify `--queuename` to use this command.")
    changed = False
    with advisory_lock('instance_group_registration_%s' % queuename):
        ig = InstanceGroup.objects.filter(name=queuename)
        control_ig = None
        if options.get('controller'):
            control_ig = InstanceGroup.objects.filter(name=options.get('controller')).first()
        if ig.exists():
            print("Instance Group already registered {}".format(ig[0].name))
            ig = ig[0]
            if control_ig and ig.controller_id != control_ig.pk:
                ig.controller = control_ig
                ig.save()
                print("Set controller group {} on {}.".format(control_ig.name, ig.name))
                changed = True
        else:
            print("Creating instance group {}".format(queuename))
            ig = InstanceGroup(name=queuename)
            if control_ig:
                ig.controller = control_ig
            ig.save()
            changed = True
        hostname_list = []
        if options.get('hostnames'):
            hostname_list = options.get('hostnames').split(",")
        instance_list = [x.strip() for x in hostname_list if x]
        for inst_name in instance_list:
            instance = Instance.objects.filter(hostname=inst_name)
            if instance.exists() and instance[0] not in ig.instances.all():
                ig.instances.add(instance[0])
                print("Added instance {} to {}".format(instance[0].hostname, ig.name))
                changed = True
            elif not instance.exists():
                print("Instance does not exist: {}".format(inst_name))
                if changed:
                    print('(changed: True)')
                sys.exit(1)
            else:
                print("Instance already registered {}".format(instance[0].hostname))
        if changed:
            print('(changed: True)')
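# Assuming the handle() above belongs to a Django management command for
# queue registration (the command name below is inferred from the code, not
# confirmed by this excerpt), an invocation would look roughly like:
#
#   awx-manage register_queue --queuename=east --hostnames=node1,node2 --controller=controlplane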
def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
    def filter_offline_instances(*args):
        return filter(lambda i: i.capacity > 0, instances)

    with mock.patch.object(InstanceGroup, 'instances',
                           Mock(spec_set=['filter'],
                                filter=lambda *args, **kwargs: Mock(spec_set=['order_by'],
                                                                    order_by=filter_offline_instances))):
        ig = InstanceGroup(id=10)

        if instance_fit_index is None:
            assert ig.find_largest_idle_instance() is None, reason
        else:
            assert ig.find_largest_idle_instance() == instances[instance_fit_index], reason
def test_fit_task_to_most_remaining_capacity_instance(self, task, instances, instance_fit_index, reason):
    InstanceGroup(id=10)
    tm_igs = TaskManagerInstanceGroups(instance_groups={'controlplane': {'instances': instances}})

    instance_picked = tm_igs.fit_task_to_most_remaining_capacity_instance(task, 'controlplane')

    if instance_fit_index is None:
        assert instance_picked is None, reason
    else:
        assert instance_picked == instances[instance_fit_index], reason
def test_find_largest_idle_instance(self, instances, instance_fit_index, reason):
    def filter_offline_instances(*args):
        return filter(lambda i: i.capacity > 0, instances)

    InstanceGroup(id=10)
    instances_online_only = filter_offline_instances(instances)
    tm_igs = TaskManagerInstanceGroups(instance_groups={'controlplane': {'instances': instances_online_only}})

    if instance_fit_index is None:
        assert tm_igs.find_largest_idle_instance('controlplane') is None, reason
    else:
        assert tm_igs.find_largest_idle_instance('controlplane') == instances[instance_fit_index], reason
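# A minimal stand-in illustrating the TaskManagerInstanceGroups surface the
# two tests above rely on. This sketch is built only from those call sites
# (constructor mapping plus two selection methods); it is not the real class.
class TaskManagerInstanceGroupsSketch:
    def __init__(self, instance_groups=None):
        # Maps group name -> {'instances': [instance, ...]}.
        self.instance_groups = instance_groups or {}

    def fit_task_to_most_remaining_capacity_instance(self, task, group_name):
        instances = self.instance_groups[group_name]['instances']
        fit = [i for i in instances if i.remaining_capacity >= task.task_impact]
        return max(fit, key=lambda i: i.remaining_capacity, default=None)

    def find_largest_idle_instance(self, group_name):
        idle = [i for i in self.instance_groups[group_name]['instances'] if i.jobs_running == 0]
        return max(idle, key=lambda i: i.capacity, default=None)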
def tower_instance_group():
    ig = InstanceGroup(name='tower')
    ig.save()
    return ig
def process_pending_tasks(self, pending_tasks):
    running_workflow_templates = set([wf.unified_job_template_id for wf in self.get_running_workflow_jobs()])
    tasks_to_update_job_explanation = []
    for task in pending_tasks:
        if self.start_task_limit <= 0:
            break
        blocked_by = self.job_blocked_by(task)
        if blocked_by:
            task.log_lifecycle("blocked", blocked_by=blocked_by)
            job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
            if task.job_explanation != job_explanation:
                if task.created < (tz_now() - self.time_delta_job_explanation):
                    task.job_explanation = job_explanation
                    tasks_to_update_job_explanation.append(task)
            continue
        preferred_instance_groups = task.preferred_instance_groups
        found_acceptable_queue = False
        if isinstance(task, WorkflowJob):
            if task.unified_job_template_id in running_workflow_templates:
                if not task.allow_simultaneous:
                    logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                    continue
            else:
                running_workflow_templates.add(task.unified_job_template_id)
            self.start_task(task, None, task.get_jobs_fail_chain(), None)
            continue
        for rampart_group in preferred_instance_groups:
            if task.capacity_type == 'execution' and rampart_group.is_container_group:
                self.graph[rampart_group.name]['graph'].add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                found_acceptable_queue = True
                break

            # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
            if settings.IS_K8S and task.capacity_type == 'execution':
                logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                continue

            remaining_capacity = self.get_remaining_capacity(rampart_group.name, capacity_type=task.capacity_type)
            if task.task_impact > 0 and remaining_capacity <= 0:
                logger.debug("Skipping group {}, remaining_capacity {} <= 0".format(rampart_group.name, remaining_capacity))
                continue

            execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[rampart_group.name]['instances']
            ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

            if execution_instance or rampart_group.is_container_group:
                if not rampart_group.is_container_group:
                    execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                    execution_instance.jobs_running += 1
                    logger.debug(
                        "Starting {} in group {} instance {} (remaining_capacity={})".format(
                            task.log_format, rampart_group.name, execution_instance.hostname, remaining_capacity
                        )
                    )

                if execution_instance:
                    execution_instance = self.real_instances[execution_instance.hostname]
                self.graph[rampart_group.name]['graph'].add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                break
            else:
                logger.debug(
                    "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                        rampart_group.name, task.log_format, task.task_impact
                    )
                )
        if not found_acceptable_queue:
            task.log_lifecycle("needs_capacity")
            job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
            if task.job_explanation != job_explanation:
                if task.created < (tz_now() - self.time_delta_job_explanation):
                    # Many launched jobs are immediately blocked, but most blocks will resolve in a few seconds.
                    # Therefore we should only update the job_explanation after some time has elapsed to
                    # prevent excessive task saves.
                    task.job_explanation = job_explanation
                    tasks_to_update_job_explanation.append(task)
            logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))
    UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
def containerized_instance_group(instance_group, kube_credential):
    ig = InstanceGroup(name="container")
    ig.credential = kube_credential
    ig.is_container_group = True
    ig.save()
    return ig
def process_pending_tasks(self, pending_tasks):
    running_workflow_templates = {wf.unified_job_template_id for wf in self.get_running_workflow_jobs()}
    tasks_to_update_job_explanation = []
    for task in pending_tasks:
        if self.start_task_limit <= 0:
            break
        blocked_by = self.job_blocked_by(task)
        if blocked_by:
            task.log_lifecycle("blocked", blocked_by=blocked_by)
            job_explanation = gettext_noop(f"waiting for {blocked_by._meta.model_name}-{blocked_by.id} to finish")
            if task.job_explanation != job_explanation:
                if task.created < (tz_now() - self.time_delta_job_explanation):
                    task.job_explanation = job_explanation
                    tasks_to_update_job_explanation.append(task)
            continue
        found_acceptable_queue = False
        preferred_instance_groups = task.preferred_instance_groups
        if isinstance(task, WorkflowJob):
            if task.unified_job_template_id in running_workflow_templates:
                if not task.allow_simultaneous:
                    logger.debug("{} is blocked from running, workflow already running".format(task.log_format))
                    continue
            else:
                running_workflow_templates.add(task.unified_job_template_id)
            self.start_task(task, None, task.get_jobs_fail_chain(), None)
            continue

        # Determine if there is control capacity for the task
        if task.capacity_type == 'control':
            control_impact = task.task_impact + settings.AWX_CONTROL_NODE_TASK_IMPACT
        else:
            control_impact = settings.AWX_CONTROL_NODE_TASK_IMPACT
        control_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
            task, self.graph[settings.DEFAULT_CONTROL_PLANE_QUEUE_NAME]['instances'], impact=control_impact, capacity_type='control'
        )
        if not control_instance:
            self.task_needs_capacity(task, tasks_to_update_job_explanation)
            logger.debug(f"Skipping task {task.log_format} in pending, not enough capacity left on controlplane to control new tasks")
            continue

        task.controller_node = control_instance.hostname

        # All task.capacity_type == 'control' jobs should run on control plane, no need to loop over instance groups
        if task.capacity_type == 'control':
            task.execution_node = control_instance.hostname
            control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - control_impact)
            control_instance.jobs_running += 1
            self.dependency_graph.add_job(task)
            execution_instance = self.real_instances[control_instance.hostname]
            task.log_lifecycle("controller_node_chosen")
            task.log_lifecycle("execution_node_chosen")
            self.start_task(task, self.controlplane_ig, task.get_jobs_fail_chain(), execution_instance)
            found_acceptable_queue = True
            continue

        for rampart_group in preferred_instance_groups:
            if rampart_group.is_container_group:
                control_instance.jobs_running += 1
                self.dependency_graph.add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), None)
                found_acceptable_queue = True
                break

            # TODO: remove this after we have confidence that OCP control nodes are reporting node_type=control
            if settings.IS_K8S and task.capacity_type == 'execution':
                logger.debug("Skipping group {}, task cannot run on control plane".format(rampart_group.name))
                continue

            # at this point we know the instance group is NOT a container group
            # because if it was, it would have started the task and broken out of the loop.
            execution_instance = InstanceGroup.fit_task_to_most_remaining_capacity_instance(
                task, self.graph[rampart_group.name]['instances'], add_hybrid_control_cost=True
            ) or InstanceGroup.find_largest_idle_instance(self.graph[rampart_group.name]['instances'], capacity_type=task.capacity_type)

            if execution_instance:
                task.execution_node = execution_instance.hostname
                # If our execution instance is a hybrid, prefer to do control tasks there as well.
                if execution_instance.node_type == 'hybrid':
                    control_instance = execution_instance
                    task.controller_node = execution_instance.hostname

                control_instance.remaining_capacity = max(0, control_instance.remaining_capacity - settings.AWX_CONTROL_NODE_TASK_IMPACT)
                task.log_lifecycle("controller_node_chosen")
                if control_instance != execution_instance:
                    control_instance.jobs_running += 1
                execution_instance.remaining_capacity = max(0, execution_instance.remaining_capacity - task.task_impact)
                execution_instance.jobs_running += 1
                task.log_lifecycle("execution_node_chosen")
                logger.debug(
                    "Starting {} in group {} instance {} (remaining_capacity={})".format(
                        task.log_format, rampart_group.name, execution_instance.hostname, execution_instance.remaining_capacity
                    )
                )
                execution_instance = self.real_instances[execution_instance.hostname]
                self.dependency_graph.add_job(task)
                self.start_task(task, rampart_group, task.get_jobs_fail_chain(), execution_instance)
                found_acceptable_queue = True
                break
            else:
                logger.debug(
                    "No instance available in group {} to run job {} w/ capacity requirement {}".format(
                        rampart_group.name, task.log_format, task.task_impact
                    )
                )
        if not found_acceptable_queue:
            self.task_needs_capacity(task, tasks_to_update_job_explanation)
    UnifiedJob.objects.bulk_update(tasks_to_update_job_explanation, ['job_explanation'])
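# task_needs_capacity is called above but not defined in this excerpt. A
# likely shape for it, inferred from the inline needs-capacity handling in
# the earlier version of process_pending_tasks (an assumption, not the
# verbatim helper; it would live on the same task-manager class and reuse
# its gettext_noop/tz_now/logger context):
def task_needs_capacity(self, task, tasks_to_update_job_explanation):
    task.log_lifecycle("needs_capacity")
    job_explanation = gettext_noop("This job is not ready to start because there is not enough available capacity.")
    if task.job_explanation != job_explanation:
        if task.created < (tz_now() - self.time_delta_job_explanation):
            # Only update job_explanation after some time has elapsed, since most
            # capacity blocks resolve within seconds; this avoids excessive saves.
            task.job_explanation = job_explanation
            tasks_to_update_job_explanation.append(task)
    logger.debug("{} couldn't be scheduled on graph, waiting for next cycle".format(task.log_format))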
def isolated_instance_group(instance_group):
    ig = InstanceGroup(name="iso", controller=instance_group)
    ig.save()
    return ig
def instance_group(job_factory):
    ig = InstanceGroup(name="east")
    ig.save()
    return ig
def isolated_instance_group(instance_group, instance):
    ig = InstanceGroup(name="iso", controller=instance_group)
    ig.save()
    ig.instances.set([instance])
    ig.save()
    return ig
def container_group():
    # Passing an instance as the first argument to mock.Mock() uses it as the
    # spec, so attribute access is constrained to the real InstanceGroup API.
    instance_group = mock.Mock(InstanceGroup(name='container-group'))
    return instance_group