def load_machines(self, service_name):
    """Load per-resource machine usage for `service_name` from redis.

    Rebuilds ``self.entities_usage`` from mongo, then for every known
    machine counts the gpu/cpu tasks currently registered under the
    ``gpu_resource:*`` / ``cpu_resource:*`` redis hashes and records the
    remaining capacity on the machine.

    Returns:
        bool: True when at least one machine is configured.
    """
    self.entities_usage = EntityUsage.initialize_entities_usage(
        self.worker._mongo_client, service_name)

    def count_tasks(xpu_tasks, resource, usage, entity_cache, is_gpu):
        # Attach each redis-registered task to its machine and add one
        # gpu (or cpu) to both the machine usage and the owning entity's
        # usage. `entity_cache` memoizes task -> entity lookups across
        # the gpu and cpu passes.
        for _, task_id in six.iteritems(xpu_tasks):
            task_entity = entity_cache.get(task_id)
            if task_entity is None:
                task_entity = task.get_owner_entity(
                    self.worker._redis, task_id)
                entity_cache[task_id] = task_entity
            if task_id not in self.preallocated_task_resource:
                self.preallocated_task_resource[task_id] = resource
            self._machines[resource].add_task(task_id, self.worker._redis)
            if is_gpu:
                usage.incr_ngpus(1)
                self.entities_usage[task_entity].add_current_usage(
                    Capacity(ngpus=1))
            else:
                usage.incr_ncpus(1)
                self.entities_usage[task_entity].add_current_usage(
                    Capacity(ncpus=1))

    for resource, machine in six.iteritems(self._machines):
        current_xpu_usage = Capacity()
        keygr = 'gpu_resource:%s:%s' % (self.worker._service, resource)
        keycr = 'cpu_resource:%s:%s' % (self.worker._service, resource)
        gpu_tasks = self.worker._redis.hgetall(keygr)
        cpu_tasks = self.worker._redis.hgetall(keycr)
        # can not launch multiple tasks on service with no multi-tasking (ec2)
        # or launch multiple tasks on service with hybrid task mode and
        # dynamic resource mode (nova)
        # BUG FIX: the original passed an undefined name `service` here;
        # `service_name` is the value in scope — confirm against
        # _is_resource_multitask's expected first argument.
        if not _is_resource_multitask(
                service_name, resource) and (gpu_tasks or cpu_tasks):
            continue
        tmp_tasks = {}
        count_tasks(gpu_tasks, resource, current_xpu_usage, tmp_tasks,
                    is_gpu=True)
        count_tasks(cpu_tasks, resource, current_xpu_usage, tmp_tasks,
                    is_gpu=False)
        available_xpus = machine._init_capacity - current_xpu_usage
        self._machines[resource].set_available(available_xpus)
        self.worker._logger.debug("\tresource %s: - free %s",
                                  resource, available_xpus)
    # BUG FIX: the original returned len(resource_mgr._machines), but
    # `resource_mgr` is undefined in this scope; self._machines is the
    # mapping iterated above.
    return len(self._machines) > 0
def _allocate_resource(self, task_id, request_resource, service, task_expected_capacity):
    """Allocates a resource for task_id and returns the name of the
    resource (or None if none where allocated), and the number of
    allocated gpus/cpus.

    GPU-requiring tasks are distributed over gpu-only and mixed
    machines; cpu-only tasks first try cpu-only machines and fall back
    to mixed machines.
    """
    task_entity = task.get_owner_entity(self._redis, task_id)
    cpu_only, gpu_only, mixed = self._split_machines_by_task_support(
        resources=service.list_resources(), service=service)

    def distribute(machines):
        # Run the distribution pass over one pool of candidate machines.
        return self._distribute_machine_for_task(
            task_id, task_entity, task_expected_capacity,
            request_resource, service, machines)

    if self._is_required_gpu_task(task_expected_capacity):
        return distribute({**gpu_only, **mixed})
    return distribute(cpu_only) or distribute(mixed)
def _get_current_config(self, task_id):
    """Return the entity configuration from redis that applies to
    `task_id`, filtered by the task's owner and storage entities."""
    owner_entity = task.get_owner_entity(self._redis, task_id)
    storage_entities = task.get_storages_entity(self._redis, task_id)
    return config.get_entity_cfg_from_redis(
        self._redis, self._service, storage_entities, owner_entity)
def try_create(next_task_id):
    """Build a CandidateTask for `next_task_id`, or return None when the
    task cannot currently be scheduled (unknown entity, unfinished or
    failed parent, or no machine able to ever hold its capacity)."""
    key_task = 'task:%s' % next_task_id
    owner_entity = task.get_owner_entity(self._redis, next_task_id)
    if owner_entity not in resource_mgr.entities_usage:
        self._logger.error(
            "\t[Task %s] entity %s - without usage limit !",
            next_task_id, owner_entity)
        return None

    # check parent dependency
    parent = self._redis.hget(key_task, 'parent')
    if parent:
        key_parent = 'task:%s' % parent
        if self._redis.exists(key_parent):
            # if the parent task is in the database, check for dependencies
            parent_status = self._redis.hget(key_parent, 'status')
            if parent_status != 'stopped':
                if parent_status == 'running':
                    # parent is still running so update queued time to be as
                    # close as possible to terminate time of parent task
                    self._redis.hset(key_task, "queued_time", time.time())
                return None
            if self._redis.hget(key_parent, 'message') != 'completed':
                task.terminate(self._redis, next_task_id,
                               phase='dependency_error')
                return None

    wanted_capacity = Capacity(self._redis.hget(key_task, 'ngpus'),
                               self._redis.hget(key_task, 'ncpus'))
    candidate = CandidateTask(next_task_id, owner_entity, self._redis,
                              wanted_capacity,
                              resource_mgr.entities_usage[owner_entity],
                              self._logger)
    # check now the task has a chance to be processed by any machine
    for machine in resource_mgr._machines.values():
        if machine._is_authorized(candidate._entity, candidate._capacity) \
                and candidate._capacity.inf_or_eq(machine._init_capacity):
            return candidate
    return None