def _service_unqueue(self, service):
    """Find the best queued task for this service and move it to the work queue.

    Scans the Redis list ``queued:<service>`` and selects the task with the
    highest priority (ties broken by earliest queued_time), skipping any task
    whose parent task exists and has not reached status 'stopped'.

    Args:
        service: service object exposing a ``name`` attribute.
    """
    with self._redis.acquire_lock('service:'+service.name):
        queue = 'queued:%s' % service.name
        count = self._redis.llen(queue)
        # Pop a task waiting for a resource on this service, check if it can run (dependency)
        # and queue it for a retry.
        best_task_id = None
        best_task_priority = -10000
        best_task_queued_time = 0
        while count > 0:
            count -= 1
            next_task_id = self._redis.lindex(queue, count)
            if next_task_id is not None:
                next_keyt = 'task:%s' % next_task_id
                parent = self._redis.hget(next_keyt, 'parent')
                priority = int(self._redis.hget(next_keyt, 'priority'))
                queued_time = float(self._redis.hget(next_keyt, 'queued_time'))
                if parent:
                    keyp = 'task:%s' % parent
                    if self._redis.exists(keyp):
                        # if the parent task is in the database, check for dependencies
                        if self._redis.hget(keyp, 'status') != 'stopped':
                            continue
                # keep the highest priority; on equal priority prefer the
                # task that was queued first (smaller queued_time)
                if priority > best_task_priority or (
                        priority == best_task_priority and best_task_queued_time > queued_time):
                    best_task_priority = priority
                    best_task_id = next_task_id
                    best_task_queued_time = queued_time
        if best_task_id:
            task.work_queue(self._redis, best_task_id, service.name)
            # fix: lrem takes (key, count, value); count=0 removes every
            # occurrence of the value — the count argument was missing here
            # while sibling implementations pass it explicitly
            self._redis.lrem(queue, 0, best_task_id)
def _handle_running_task(self, task_id):
    """Poll the backing service for a running task's health.

    If the service reports the task dead, request termination; otherwise
    re-queue a later poll. Polling failures are counted in the Redis field
    ``status_fail``; after more than 4 failures the task is terminated with
    phase 'lost_connection'.
    """
    keyt = 'task:%s' % task_id
    _, service = self._get_service(keyt=keyt)
    self._logger.debug('- checking activity of task: %s', task_id)
    # 'job' holds the service-specific launch descriptor as JSON
    data = json.loads(self._redis.hget(keyt, 'job'))
    try:
        status = service.status(task_id, data)
        if status == 'dead':
            self._logger.info(
                '%s: task no longer running on %s, request termination',
                task_id, service.name)
            task.terminate(self._redis, task_id, phase='exited')
        else:
            # services that notify their own activity are polled less often
            task.work_queue(self._redis, task_id, service.name,
                            delay=service.is_notifying_activity and 600 or 120)
    except Exception as e:
        # status probe failed — count consecutive failures
        self._logger.info('cannot get status for [%s] - %s', task_id, str(e))
        self._redis.hincrby(keyt, 'status_fail', 1)
        if int(self._redis.hget(keyt, 'status_fail')) > 4:
            return task.terminate(self._redis, task_id, phase='lost_connection')
    # NOTE(review): this reset also runs after a non-fatal probe failure
    # (the except branch falls through), so status_fail may never accumulate
    # past 1 — confirm whether the except branch should return before this.
    self._redis.hdel(keyt, 'status_fail')
    return None
def reorganize_tasks(): logger.debug(f"[{service}-{pid}]: Reorganizing tasks") # On startup, add all active tasks in the work queue or service queue for task_id in task.list_active(redis, service): task_key = f'task:{task_id}' with redis.acquire_lock(task_id): status = redis.hget(task_key, 'status') if status in ['queued', 'allocated']: task.service_queue(redis, task_id, service) task.set_status(redis, 'task:' + task_id, 'queued') else: task.work_queue(redis, task_id, service) # check integrity of tasks if redis.hget(task_key, 'priority') is None: redis.hset(task_key, 'priority', 0) if redis.hget(task_key, 'queued_time') is None: redis.hset(task_key, 'queued_time', time.time())
def run(self):
    """Event loop reacting to Redis keyspace 'expired' notifications.

    An expired ``beat:<task_id>`` key means no heartbeat was received: the
    task is reported to the external 'task/terminate' callback service and
    then terminated locally. An expired ``queue:<task_id>`` key re-queues
    the task into the work queue.
    """
    # exit cleanly on SIGTERM/SIGINT
    signal.signal(signal.SIGTERM, graceful_exit)
    signal.signal(signal.SIGINT, graceful_exit)
    pubsub = self._redis.pubsub()
    pubsub.psubscribe('__keyspace@0__:beat:*')
    pubsub.psubscribe('__keyspace@0__:queue:*')
    while True:
        message = pubsub.get_message()
        if message:
            channel = message['channel']
            data = message['data']
            if data == 'expired':
                # task expired, no beat was received
                if channel.startswith('__keyspace@0__:beat:'):
                    # channel prefix is 20 chars; the remainder is the task id
                    task_id = channel[20:]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: task expired', task_id)
                        auth_token = self._redis.hget('task:%s' % task_id, 'token')
                        # NOTE(review): assumes 'callback_url' is configured for
                        # this service — a missing value would make the replace/
                        # request below fail; confirm the config guarantees it.
                        callback_url = self._services[service]._config.get('callback_url')
                        if auth_token:
                            # embed the token as basic-auth credentials in the URL
                            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")
                        r = requests.get(os.path.join(callback_url, "task/terminate", task_id),
                                         params={'phase': 'expired'})
                        if r.status_code != 200:
                            self._logger.warning('incorrect result from \'task/terminate\' service: %s' % r.text)
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis, task_id, phase='expired')
                # expired in the queue - comes back in the work queue
                elif channel.startswith('__keyspace@0__:queue:'):
                    # channel prefix is 21 chars; the remainder is the task id
                    task_id = channel[21:]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: move to work queue', task_id)
                        task.work_queue(self._redis, task_id, service)
        time.sleep(self._work_cycle)
def run(self):
    """Worker main loop: drain the work queue and periodically look for free resources."""
    # register clean-shutdown handlers before entering the loop
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, graceful_exit)
    self._logger.info('Starting...')
    tick = 0
    while True:
        # pull at most one task from the work queue and try to advance it
        next_task = task.work_unqueue(self._redis, self._service)
        if next_task is not None:
            try:
                self._advance_task(next_task)
            except RuntimeWarning:
                # lock contention — push the task back for a later retry
                self._logger.warning(
                    '%s: failed to acquire a lock, retrying', next_task)
                task.work_queue(self._redis, next_task, self._service)
            except Exception as e:
                # any other failure aborts the task with a launch error
                self._logger.error('%s: %s', next_task, str(e))
                with self._redis.acquire_lock(next_task):
                    task.set_log(self._redis, self._taskfile_dir, next_task, str(e))
                    task.terminate(self._redis, next_task, phase="launch_error")
                self._logger.info(traceback.format_exc())
        # every 0.01s * refresh_counter - check if we can find some free resource
        if tick > self._refresh_counter:
            # only scan when there are queued tasks for this service
            if self._redis.exists('queued:%s' % self._service):
                self._logger.debug('checking processes on : %s', self._service)
                self._select_best_task_to_process(
                    self._services[self._service])
            tick = 0
        tick += 1
        time.sleep(self._work_cycle)
def run(self):
    """Watch Redis keyspace notifications and handle expired beat/queue keys."""
    # register clean-shutdown handlers before entering the loop
    for sig in (signal.SIGTERM, signal.SIGINT):
        signal.signal(sig, graceful_exit)
    subscriber = self._redis.pubsub()
    subscriber.psubscribe('__keyspace@0__:beat:*')
    subscriber.psubscribe('__keyspace@0__:queue:*')
    beat_prefix = '__keyspace@0__:beat:'
    queue_prefix = '__keyspace@0__:queue:'
    while True:
        event = subscriber.get_message()
        if event and event['data'] == 'expired':
            chan = event['channel']
            if chan.startswith(beat_prefix):
                # beat key expired: the task stopped sending heartbeats
                tid = chan[len(beat_prefix):]
                owner = self._redis.hget('task:' + tid, 'service')
                if owner in self._services:
                    self._logger.info('%s: task expired', tid)
                    with self._redis.acquire_lock(tid):
                        task.terminate(self._redis, tid, phase='expired')
            elif chan.startswith(queue_prefix):
                # queue entry expired: the task comes back into the work queue
                tid = chan[len(queue_prefix):]
                owner = self._redis.hget('task:' + tid, 'service')
                if owner in self._services:
                    self._logger.info('%s: move to work queue', tid)
                    task.work_queue(self._redis, tid, owner)
        time.sleep(self._work_cycle)
def _advance_task(self, task_id):
    """Tries to advance the task to the next status. If it can, re-queue it immediately
    to process the next stage. Otherwise, re-queue it after some delay to try again.

    State machine over the Redis field ``status``:
    queued -> allocating/allocated -> running -> (terminating -> stopped).
    Raises ValueError for an unknown service name.
    """
    keyt = 'task:%s' % task_id
    # short acquire timeout: callers catch the resulting RuntimeWarning and
    # re-queue the task — presumably acquire_lock raises on timeout; confirm
    with self._redis.acquire_lock(keyt, acquire_timeout=1, expire_time=600):
        status = self._redis.hget(keyt, 'status')
        if status == 'stopped':
            # nothing to advance for a stopped task
            return
        service_name = self._redis.hget(keyt, 'service')
        if service_name not in self._services:
            raise ValueError('unknown service %s' % service_name)
        service = self._services[service_name]
        self._logger.info('%s: trying to advance from status %s', task_id, status)
        if status == 'queued':
            resource = self._redis.hget(keyt, 'resource')
            parent = self._redis.hget(keyt, 'parent')
            if parent:
                keyp = 'task:%s' % parent
                # if the parent task is in the database, check for dependencies
                if self._redis.exists(keyp):
                    status = self._redis.hget(keyp, 'status')
                    if status == 'stopped':
                        if self._redis.hget(keyp, 'message') != 'completed':
                            # parent failed: this task can never run
                            task.terminate(self._redis, task_id,
                                           phase='dependency_error')
                            return
                    else:
                        # parent still pending/running: wait on the service queue
                        self._logger.warning('%s: depending on other task, waiting', task_id)
                        task.service_queue(self._redis, task_id, service.name)
                        return
            ngpus = int(self._redis.hget(keyt, 'ngpus'))
            resource, available_gpus = self._allocate_resource(task_id, resource, service, ngpus)
            if resource is not None:
                self._logger.info('%s: resource %s reserved (%d/%d)',
                                  task_id, resource, available_gpus, ngpus)
                self._redis.hset(keyt, 'alloc_resource', resource)
                if ngpus == available_gpus:
                    # all requested GPUs were reserved at once
                    task.set_status(self._redis, keyt, 'allocated')
                else:
                    # partial reservation: keep collecting GPUs on this resource
                    task.set_status(self._redis, keyt, 'allocating')
                task.work_queue(self._redis, task_id, service_name)
            else:
                self._logger.warning('%s: no resources available, waiting', task_id)
                task.service_queue(self._redis, task_id, service.name)
        elif status == 'allocating':
            resource = self._redis.hget(keyt, 'alloc_resource')
            keyr = 'resource:%s:%s' % (service.name, resource)
            ngpus = int(self._redis.hget(keyt, 'ngpus'))
            # count the GPUs this task already holds on the resource hash
            already_allocated_gpus = 0
            for k, v in six.iteritems(self._redis.hgetall(keyr)):
                if v == task_id:
                    already_allocated_gpus += 1
            capacity = service.list_resources()[resource]
            available_gpus, remaining_gpus = self._reserve_resource(
                service, resource, capacity, task_id,
                ngpus - already_allocated_gpus, 0, -1, True)
            self._logger.warning('task: %s - resource: %s (capacity %d)- already %d - available %d',
                                 task_id, resource, capacity, already_allocated_gpus, available_gpus)
            if available_gpus == ngpus - already_allocated_gpus:
                # allocation complete: release the reservation marker
                task.set_status(self._redis, keyt, 'allocated')
                key_reserved = 'reserved:%s:%s' % (service.name, resource)
                self._redis.delete(key_reserved)
                task.work_queue(self._redis, task_id, service.name)
            else:
                # still short of GPUs: retry later
                task.work_queue(self._redis, task_id, service.name,
                                delay=service.is_notifying_activity and 120 or 30)
        elif status == 'allocated':
            content = json.loads(self._redis.hget(keyt, 'content'))
            resource = self._redis.hget(keyt, 'alloc_resource')
            self._logger.info('%s: launching on %s', task_id, service.name)
            try:
                keyr = 'resource:%s:%s' % (service.name, resource)
                # collect the GPU ids reserved for this task
                lgpu = []
                for k, v in six.iteritems(self._redis.hgetall(keyr)):
                    if v == task_id:
                        lgpu.append(k)
                self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
                data = service.launch(
                    task_id,
                    content['options'],
                    lgpu,
                    resource,
                    content['docker']['registry'],
                    content['docker']['image'],
                    content['docker']['tag'],
                    content['docker']['command'],
                    task.file_list(self._redis, task_id),
                    content['wait_after_launch'])
            except EnvironmentError as e:
                # the resource is not available and will be set busy
                self._block_resource(resource, service, str(e))
                # set the task as queued again
                self._redis.hdel(keyt, 'alloc_resource')
                self._release_resource(service, resource, task_id)
                task.set_status(self._redis, keyt, 'queued')
                task.service_queue(self._redis, task_id, service.name)
                self._logger.info('could not launch [%s] %s on %s: blocking resource',
                                  str(e), task_id, resource)
                return
            except Exception as e:
                # all other errors make the task fail
                task.append_log(self._redis, task_id, str(e))
                task.terminate(self._redis, task_id, phase='launch_error')
                return
            self._logger.info('%s: task started on %s', task_id, service.name)
            self._redis.hset(keyt, 'job', json.dumps(data))
            task.set_status(self._redis, keyt, 'running')
            # For services that do not notify their activity, we should
            # poll the task status more regularly.
            task.work_queue(self._redis, task_id, service.name,
                            delay=service.is_notifying_activity and 120 or 30)
        elif status == 'running':
            self._logger.debug('- checking activity of task: %s', task_id)
            data = json.loads(self._redis.hget(keyt, 'job'))
            status = service.status(task_id, data)
            if status == 'dead':
                self._logger.info('%s: task no longer running on %s, request termination',
                                  task_id, service.name)
                task.terminate(self._redis, task_id, phase='exited')
            else:
                task.work_queue(self._redis, task_id, service.name,
                                delay=service.is_notifying_activity and 120 or 30)
        elif status == 'terminating':
            data = self._redis.hget(keyt, 'job')
            if data is not None:
                # the task was launched: ask the service to tear it down
                container_id = self._redis.hget(keyt, 'container_id')
                data = json.loads(data)
                data['container_id'] = container_id
                self._logger.info('%s: terminating task (%s)', task_id, json.dumps(data))
                try:
                    service.terminate(data)
                    self._logger.info('%s: terminated', task_id)
                except Exception:
                    # best-effort: resources are released below regardless
                    self._logger.warning('%s: failed to terminate', task_id)
            resource = self._redis.hget(keyt, 'alloc_resource')
            self._release_resource(service, resource, task_id)
            task.set_status(self._redis, keyt, 'stopped')
            task.disable(self._redis, task_id)
def run(self):
    """Combined worker loop: handle keyspace expiration events and, when idle,
    advance queued work and unblock reserved resources across all services.
    """
    self._logger.info('Starting worker')
    # Subscribe to beat expiration.
    pubsub = self._redis.pubsub()
    pubsub.psubscribe('__keyspace@0__:beat:*')
    pubsub.psubscribe('__keyspace@0__:queue:*')
    counter = 0
    while True:
        message = pubsub.get_message()
        if message:
            channel = message['channel']
            data = message['data']
            if data == 'expired':
                self._logger.warning('received expired event on channel %s', channel)
                if channel.startswith('__keyspace@0__:beat:'):
                    # beat key expired: no heartbeat received -> terminate
                    task_id = channel[20:]
                    service = self._redis.hget('task:'+task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: task expired', task_id)
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis, task_id, phase='expired')
                elif channel.startswith('__keyspace@0__:queue:'):
                    # queue entry expired: the task goes back to the work queue
                    task_id = channel[21:]
                    service = self._redis.hget('task:'+task_id, 'service')
                    if service in self._services:
                        task.work_queue(self._redis, task_id, service)
        else:
            # no pubsub message: advance work for each service in turn
            for service in self._services:
                task_id = task.work_unqueue(self._redis, service)
                if task_id is not None:
                    try:
                        self._advance_task(task_id)
                    except RuntimeWarning:
                        # lock contention: retry later
                        self._logger.warning(
                            '%s: failed to acquire a lock, retrying', task_id)
                        task.work_queue(self._redis, task_id, service)
                    except Exception as e:
                        self._logger.error('%s: %s', task_id, str(e))
                        with self._redis.acquire_lock(task_id):
                            task.set_log(self._redis, task_id, str(e))
                            task.terminate(self._redis, task_id, phase="launch_error")
                else:
                    if counter > self._refresh_counter:
                        # shorten the TTL of a stale reservation's queue entry so the
                        # reserved task comes back quickly when capacity is free
                        resources = self._services[service].list_resources()
                        for resource in resources:
                            keyr = 'resource:%s:%s' % (service, resource)
                            key_busy = 'busy:%s:%s' % (service, resource)
                            key_reserved = 'reserved:%s:%s' % (service, resource)
                            if not self._redis.exists(key_busy) and self._redis.hlen(keyr) < resources[resource]:
                                if self._redis.exists(key_reserved) and self._redis.ttl('queue:'+self._redis.get(key_reserved))>10:
                                    self._redis.expire('queue:'+self._redis.get(key_reserved), 5)
                                    break
                        if self._redis.exists('queued:%s' % service):
                            resources = self._services[service].list_resources()
                            self._logger.debug('checking processes on : %s', service)
                            # look for at least one node with spare, unreserved capacity
                            availableResource = False
                            for resource in resources:
                                keyr = 'resource:%s:%s' % (service, resource)
                                key_busy = 'busy:%s:%s' % (service, resource)
                                key_reserved = 'reserved:%s:%s' % (service, resource)
                                if not self._redis.exists(key_busy) and self._redis.hlen(keyr) < resources[resource]:
                                    if not self._redis.exists(key_reserved):
                                        availableResource = True
                                        break
                            if availableResource:
                                self._logger.debug('resources available on %s - trying dequeuing', service)
                                self._service_unqueue(self._services[service])
        # NOTE(review): counter is reset once it passes the refresh threshold,
        # giving the periodic resource scan its cadence — confirm placement.
        if counter > self._refresh_counter:
            counter = 0
        counter += 1
        time.sleep(0.01)
# remove reserved state from resources
for key in redis.keys('reserved:%s:*' % service):
    redis.delete(key)
# remove queued tasks on service
for key in redis.keys('queued:%s' % service):
    redis.delete(key)
# On startup, add all active tasks in the work queue or service queue
for task_id in task.list_active(redis, service):
    with redis.acquire_lock(task_id):
        status = redis.hget('task:'+task_id, 'status')
        if status == 'queued' or status == 'allocating' or status == 'allocated':
            # still waiting for resources: back to the service queue as 'queued'
            task.service_queue(redis, task_id, redis.hget('task:'+task_id, 'service'))
            task.set_status(redis, 'task:'+task_id, 'queued')
        else:
            task.work_queue(redis, task_id, service)
        # check integrity of tasks: the fields live on the 'task:<id>' hash,
        # so the key must carry the 'task:' prefix (previously the bare
        # task_id was used, which always read None and wrote to a wrong key)
        if redis.hget('task:'+task_id, 'priority') is None:
            redis.hset('task:'+task_id, 'priority', 0)
        if redis.hget('task:'+task_id, 'queued_time') is None:
            redis.hset('task:'+task_id, 'queued_time', time.time())
# Deallocate all resources that are no longer associated to a running task
resources = services[service].list_resources()
for resource in resources:
    keyr = 'resource:%s:%s' % (service, resource)
    running_tasks = redis.hgetall(keyr)
    for g, task_id in six.iteritems(running_tasks):
        with redis.acquire_lock(task_id):
            status = redis.hget('task:'+task_id, 'status')
def _select_best_task_to_process(self, service):
    """find the best next task to push to the work queue

    Builds a per-entity usage picture, filters the queued tasks down to those
    that can run (dependencies satisfied, some machine can host them), sorts
    them by priority/entity fairness, and moves the first one that gets a
    resource allocation to the work queue.

    NOTE: the nested classes below deliberately close over this method's
    locals (`self`, `service`, `resource_mgr`, `next_task_id`) — they are not
    self-contained and must stay defined inside this method.
    """
    class EntityUsage:
        # Tracks one entity's resource consumption, weighted by its share.
        def __init__(self, current_usage, entity_name, usage_coeff):
            self._entity = entity_name
            self._current_usage_capacity = current_usage if current_usage else Capacity()
            self._usage_coeff = usage_coeff

        def __str__(self):
            return 'EntityUsage (%s, Absolute usage :%s . Weighted usage : %s. Weight:%f)' % (
                self._entity, self._current_usage_capacity, self._weighted_usage,
                self._usage_coeff)

        @property
        def _weighted_usage(self):
            # (weighted cpus, weighted gpus)
            return self._current_usage_capacity.ncpus * self._usage_coeff,\
                self._current_usage_capacity.ngpus * self._usage_coeff

        def add_current_usage(self, current_usage):
            self._current_usage_capacity += current_usage

        def __eq__(self, other):
            return self._weighted_usage[0] == other._weighted_usage[0] and \
                self._weighted_usage[1] == other._weighted_usage[1]

        def __lt__(self, other):
            # GPUs dominate the ordering; CPUs break ties
            return self._weighted_usage[1] < other._weighted_usage[1] or \
                (self._weighted_usage[1] == other._weighted_usage[1] and
                 self._weighted_usage[0] < other._weighted_usage[0])

        def __le__(self, other):
            return self == other or self < other

        @staticmethod
        def initialize_entities_usage(mongo_client, service_name):
            # weight = weight_sum / rate, so lower-rate entities weigh more
            entity_usage_weights = config.get_entities_limit_rate(
                mongo_client, service_name)
            weight_sum = float(
                sum([w for w in entity_usage_weights.values() if w > 0]))
            entities_usage = {
                e: EntityUsage(None, e, float(weight_sum) / r if r > 0 else 0)
                for e, r in six.iteritems(entity_usage_weights)
            }
            return entities_usage

    class CandidateTask:
        # A queued task that passed dependency/entity checks; orderable so the
        # "best" candidate sorts first (see is_higher_priority).
        def __init__(self, task_id, task_entity, redis, task_capacity, entity_usage, logger):
            assert task_id
            self._task_id = task_id
            self._entity = task_entity
            # NOTE(review): uses `next_task_id` from the enclosing method's
            # scope instead of the `task_id` parameter — works only because
            # try_create always passes the same value; confirm and prefer the
            # parameter.
            self._redis_key = 'task:%s' % next_task_id
            self._priority = int(redis.hget(self._redis_key, 'priority'))
            self._launched_time = float(
                redis.hget(self._redis_key, 'launched_time'))
            self._runnable_machines = set()
            self._capacity = task_capacity
            self._entity_usage = entity_usage
            self._logger = logger

        def __str__(self):
            return "Task ( %s / %s ; %s ; Priority:%d)" % (
                self._task_id, self._capacity, self._entity_usage, self._priority)

        def __gt__(self, other):
            return self.is_higher_priority(other)

        def __ge__(self, other):
            # NOTE(review): identical to __gt__, so >= ignores equality — confirm.
            return self.is_higher_priority(other)

        def _already_on_node(self):
            # true if the task already holds resources on some machine
            result = self._task_id in resource_mgr.preallocated_task_resource
            return result

        def _is_more_respectful_usage(self, other):
            if self._entity == other._entity:  # same entity, go for highest priority
                is_more_prio = self._priority > other._priority or (
                    self._priority == other._priority and
                    self._launched_time < other._launched_time)
                return is_more_prio
            # different entities: favor the one with lower weighted usage
            my_entity_usage = resource_mgr.entities_usage[self._entity]
            other_entity_usage = resource_mgr.entities_usage[other._entity]
            if my_entity_usage == other_entity_usage:
                return self._launched_time < other._launched_time
            result = my_entity_usage < other_entity_usage
            self._logger.debug("AZ-COMPUSE: my: %s.Other: %s . Result = %s",
                               my_entity_usage, other_entity_usage, result)
            return result

        def is_higher_priority(self, other_task):
            # Decision tree for the most priority task
            if not other_task:
                return True
            # go for already allocated resource task
            if self._already_on_node():
                if not other_task._already_on_node():
                    return True
                return self._is_more_respectful_usage(other_task)
            if other_task._already_on_node():
                return False
            return self._is_more_respectful_usage(other_task)

        @staticmethod
        def try_create(next_task_id):
            # NOTE(review): although declared @staticmethod, this closes over
            # `self`, `resource_mgr` and the sibling classes from the
            # enclosing method — it cannot be moved out of this scope.
            next_keyt = 'task:%s' % next_task_id
            parent = self._redis.hget(next_keyt, 'parent')
            task_entity = task.get_owner_entity(self._redis, next_task_id)
            if task_entity not in resource_mgr.entities_usage:
                self._logger.error("\t[Task %s] entity %s - without usage limit !",
                                   next_task_id, task_entity)
                return None
            # check parent dependency
            if parent:
                keyp = 'task:%s' % parent
                if self._redis.exists(keyp):
                    # if the parent task is in the database, check for dependencies
                    parent_status = self._redis.hget(keyp, 'status')
                    if parent_status != 'stopped':
                        if parent_status == 'running':
                            # parent is still running so update queued time to be as close
                            # as possible to terminate time of parent task
                            self._redis.hset(next_keyt, "queued_time", time.time())
                        return None
                    if self._redis.hget(keyp, 'message') != 'completed':
                        # parent failed: propagate as dependency error
                        task.terminate(self._redis, next_task_id,
                                       phase='dependency_error')
                        return None
            task_capacity = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                     self._redis.hget(next_keyt, 'ncpus'))
            candidate_task = CandidateTask(next_task_id, task_entity, self._redis,
                                           task_capacity,
                                           resource_mgr.entities_usage[task_entity],
                                           self._logger)
            # check now the task has a chance to be processed by any machine
            for _, machine in six.iteritems(resource_mgr._machines):
                can_be_processed = machine._is_authorized(candidate_task._entity, candidate_task._capacity) \
                    and candidate_task._capacity.inf_or_eq(machine._init_capacity)
                if can_be_processed:
                    return candidate_task
            return None

    class ResourceManager:
        # Snapshot of machines, their free capacity and per-entity usage.
        def __init__(self, worker):
            self.preallocated_task_resource = {}
            resources = service.list_resources()
            self._machines = {
                res: Worker.Machine(service, res, resources[res], worker._logger,
                                    service.get_server_detail(res, "priority"))
                for res in resources
            }
            self.entities_usage = {}
            self.worker = worker

        def __str__(self):
            msg = " - ".join(str(m) for m in self._machines.values())
            return "ResourceManager ( %s )." % msg

        def load_machines(self, service_name):
            """Scan gpu/cpu resource hashes to compute per-machine free capacity
            and per-entity usage; returns True when at least one machine exists.
            """
            self.entities_usage = EntityUsage.initialize_entities_usage(
                self.worker._mongo_client, service_name)
            for resource, machine in six.iteritems(self._machines):
                current_xpu_usage = Capacity()
                keygr = 'gpu_resource:%s:%s' % (self.worker._service, resource)
                keycr = 'cpu_resource:%s:%s' % (self.worker._service, resource)
                gpu_tasks = self.worker._redis.hgetall(keygr)
                cpu_tasks = self.worker._redis.hgetall(keycr)
                # can not launch multiple tasks on service with no multi-tasking (ec2)
                # or launch multiple tasks on service with hybrid task mode and dynamic resource mode (nova)
                if not _is_resource_multitask(service, resource) and (gpu_tasks or cpu_tasks):
                    continue
                tmp_tasks = {}  # task_id -> owning entity (memoized)
                for _, v in six.iteritems(gpu_tasks):
                    if v not in tmp_tasks:
                        task_entity = task.get_owner_entity(self.worker._redis, v)
                        tmp_tasks[v] = task_entity
                    else:
                        task_entity = tmp_tasks[v]
                    if v not in self.preallocated_task_resource:
                        self.preallocated_task_resource[v] = resource
                    self._machines[resource].add_task(v, self.worker._redis)
                    current_xpu_usage.incr_ngpus(1)
                    self.entities_usage[task_entity].add_current_usage(Capacity(ngpus=1))
                for _, v in six.iteritems(cpu_tasks):
                    if v not in tmp_tasks:
                        task_entity = task.get_owner_entity(self.worker._redis, v)
                        tmp_tasks[v] = task_entity
                    else:
                        task_entity = tmp_tasks[v]
                    if v not in self.preallocated_task_resource:
                        self.preallocated_task_resource[v] = resource
                    self._machines[resource].add_task(v, self.worker._redis)
                    current_xpu_usage.incr_ncpus(1)
                    self.entities_usage[task_entity].add_current_usage(Capacity(ncpus=1))
                available_xpus = machine._init_capacity - current_xpu_usage
                self._machines[resource].set_available(available_xpus)
                self.worker._logger.debug("\tresource %s: - free %s",
                                          resource, available_xpus)
            # NOTE(review): returns via the enclosing method's `resource_mgr`
            # closure rather than `self` — equivalent once resource_mgr is
            # bound, but fragile; confirm.
            return len(resource_mgr._machines) > 0

    with self._redis.acquire_lock('service:' + service.name):
        queue = 'queued:%s' % service.name
        count = self._redis.llen(queue)
        if count == 0:
            return
        resource_mgr = ResourceManager(self)
        if not resource_mgr.load_machines(service.name):
            return
        runnable_tasks = []
        for e in resource_mgr.entities_usage.values():
            self._logger.debug("[AZ-USE] %s", e)
        # walk the queue from oldest to newest, collecting runnable candidates
        while count > 0:
            count -= 1
            next_task_id = self._redis.lindex(queue, count)
            candidate_task = CandidateTask.try_create(next_task_id)
            if candidate_task:
                runnable_tasks.append(candidate_task)
        num_of_runnable_tasks = len(runnable_tasks)
        self._logger.info('Runnable task count: %d', num_of_runnable_tasks)
        if num_of_runnable_tasks > 0:
            # best candidate first (CandidateTask ordering)
            sorted_runnable_tasks = sorted(runnable_tasks, reverse=True)
            for runnable_task in sorted_runnable_tasks:
                task_id = runnable_task._task_id
                nxpus = runnable_task._capacity
                keyt = 'task:%s' % task_id
                request_resource = self._redis.hget(keyt, 'resource')
                allocated_resource = self._allocate_resource(
                    task_id, request_resource, service, nxpus)
                if allocated_resource is not None:
                    self._logger.info('%s: resource %s reserved %s',
                                      task_id, allocated_resource, nxpus)
                    self._redis.hset(keyt, 'alloc_resource', allocated_resource)
                    task.set_status(self._redis, keyt, 'allocated')
                    task.work_queue(self._redis, task_id, service.name)
                    self._redis.lrem(queue, 0, task_id)
                    self._logger.info('[AZ-SELECTED] %s to be launched on %s',
                                      task_id, service.name)
                    break
                # allocation failed for this candidate: try the next one
                self._logger.info(
                    '[AZ-SELECTED] %s to be launched on %s, but not able to allocate resource',
                    task_id, service.name)
def _handle_allocated_task(self, task_id):
    """Launch a task whose resources are fully allocated.

    Collects the GPU/CPU ids reserved for the task, calls the service's
    launch, and transitions the task to 'running'. An EnvironmentError from
    launch blocks the resource and re-queues the task; any other error
    reports a launch_error through the external callback service.
    """
    keyt = 'task:%s' % task_id
    _, service = self._get_service(keyt=keyt)
    content = json.loads(self._redis.hget(keyt, 'content'))
    resource = self._redis.hget(keyt, 'alloc_resource')
    self._logger.info('%s: launching on %s', task_id, service.name)
    try:
        entity_config = self._get_current_config(task_id)
        keygr = 'gpu_resource:%s:%s' % (service.name, resource)
        # GPU ids on this resource that are assigned to the task
        lgpu = []
        for k, v in six.iteritems(self._redis.hgetall(keygr)):
            if v == task_id:
                lgpu.append(k)
        self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
        keycr = 'cpu_resource:%s:%s' % (service.name, resource)
        # CPU ids on this resource that are assigned to the task
        lcpu = []
        for k, v in six.iteritems(self._redis.hgetall(keycr)):
            if v == task_id:
                lcpu.append(k)
        self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))
        data = service.launch(
            task_id,
            content['options'],
            (lgpu, lcpu),
            resource,
            entity_config["storages"],
            entity_config["docker"],
            content['docker']['registry'],
            content['docker']['image'],
            content['docker']['tag'],
            content['docker']['command'],
            task.file_list(self._taskfile_dir, task_id),
            content['wait_after_launch'],
            self._redis.hget(keyt, 'token'),
            content.get('support_statistics'))
    except EnvironmentError as e:
        # the resource is not available and will be set busy
        self._block_resource(resource, service, str(e))
        self._redis.hdel(keyt, 'alloc_resource')
        # set the task as queued again
        self._release_resource(
            service, resource, task_id,
            Capacity(self._redis.hget(keyt, 'ngpus'),
                     self._redis.hget(keyt, 'ncpus')))
        status = self._redis.hget(keyt, 'status')
        if status == 'terminating':
            # a concurrent termination wins: do not re-queue
            return None
        task.set_status(self._redis, keyt, 'queued')
        task.service_queue(self._redis, task_id, service.name)
        self._logger.info('could not launch [%s] %s on %s: blocking resource',
                          str(e), task_id, resource)
        self._logger.info(traceback.format_exc())
        return None
    except Exception as e:
        # all other errors make the task fail
        self._logger.info('fail task [%s] - %s', task_id, str(e))
        self._logger.info(traceback.format_exc())
        task.append_log(self._taskfile_dir, task_id, str(e))
        auth_token = self._redis.hget(keyt, 'token')
        # NOTE(review): assumes 'callback_url' is configured — confirm.
        callback_url = service._config.get('callback_url')
        if auth_token:
            # embed the token as basic-auth credentials in the URL
            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")
        r = requests.get(os.path.join(callback_url, "task/terminate", task_id),
                         params={'phase': 'launch_error'})
        if r.status_code != 200:
            raise RuntimeError(
                'incorrect result from \'task/terminate\' service: %s' % r.text) from e
        task.terminate(self._redis, task_id, phase='launch_error')
        self._logger.info(traceback.format_exc())
        return None
    self._logger.info('%s: task started on %s', task_id, service.name)
    self._redis.hset(keyt, 'job', json.dumps(data))
    status = self._redis.hget(keyt, 'status')
    if status == 'terminating':
        # terminated while launching: leave status handling to the terminator
        return None
    task.set_status(self._redis, keyt, 'running')
    # For services that do not notify their activity, we should
    # poll the task status more regularly.
    task.work_queue(self._redis, task_id, service.name,
                    delay=service.is_notifying_activity and 120 or 30)
    return None
def _service_unqueue(self, service):
    """find the best next task to push to the work queue

    Capacity-aware variant: computes free CPU/GPU per node (honoring busy and
    reserved markers), then picks the queued task with the highest priority
    (ties by earliest queued_time) that can fit somewhere.
    """
    with self._redis.acquire_lock('service:' + service.name):
        queue = 'queued:%s' % service.name
        count = self._redis.llen(queue)
        idx = 0  # NOTE(review): unused local — confirm and remove.
        preallocated_task_count = {}     # task_id -> Capacity already held
        preallocated_task_resource = {}  # task_id -> node holding it
        avail_resource = {}              # node -> free Capacity
        resources = service.list_resources()
        reserved = {}                    # node -> reserving task id (or None)
        # list free cpu/gpus on each node
        for resource in resources:
            current_xpu_usage = Capacity()
            capacity = resources[resource]
            # NOTE(review): gpu/cpu keys use self._service while key_reserved
            # uses service.name — presumably the same string; confirm.
            keygr = 'gpu_resource:%s:%s' % (self._service, resource)
            keycr = 'cpu_resource:%s:%s' % (self._service, resource)
            key_reserved = 'reserved:%s:%s' % (service.name, resource)
            gpu_tasks = self._redis.hgetall(keygr)
            cpu_tasks = self._redis.hgetall(keycr)
            task_reserved = self._redis.get(key_reserved)
            # can not launch multiple tasks on service with no multi-tasking (ec2)
            if not service.resource_multitask and \
                    not task_reserved and \
                    (gpu_tasks or cpu_tasks):
                continue
            for k, v in six.iteritems(gpu_tasks):
                if v in preallocated_task_count:
                    preallocated_task_count[v].incr_ngpus(1)
                else:
                    preallocated_task_count[v] = Capacity(ngpus=1)
                    preallocated_task_resource[v] = resource
                current_xpu_usage.incr_ngpus(1)
            for k, v in six.iteritems(cpu_tasks):
                if v in preallocated_task_count:
                    preallocated_task_count[v].incr_ncpus(1)
                else:
                    preallocated_task_count[v] = Capacity(ncpus=1)
                    preallocated_task_resource[v] = resource
                current_xpu_usage.incr_ncpus(1)
            available_xpus = capacity - current_xpu_usage
            avail_resource[resource] = available_xpus
            reserved[resource] = task_reserved
            self._logger.debug("\tresource %s - reserved: %s - free %s",
                               resource, task_reserved or "False", available_xpus)
        if len(avail_resource) == 0:
            return
        # Go through the tasks, find if there are tasks that can be launched and
        # queue the best one
        best_task_id = None
        best_task_priority = -10000
        best_task_queued_time = 0
        while count > 0:
            count -= 1
            next_task_id = self._redis.lindex(queue, count)
            if next_task_id is not None:
                next_keyt = 'task:%s' % next_task_id
                # self._logger.debug("\tcheck task: %s", next_task_id)
                parent = self._redis.hget(next_keyt, 'parent')
                # check parent dependency
                if parent:
                    keyp = 'task:%s' % parent
                    if self._redis.exists(keyp):
                        # if the parent task is in the database, check for dependencies
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status != 'stopped':
                            if parent_status == 'running':
                                # parent is still running so update queued time to be as close
                                # as possible to terminate time of parent task
                                self._redis.hset(next_keyt, "queued_time", time.time())
                            continue
                        else:
                            if self._redis.hget(keyp, 'message') != 'completed':
                                # parent failed: propagate as dependency error
                                task.terminate(self._redis, next_task_id,
                                               phase='dependency_error')
                                continue
                nxpus = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                 self._redis.hget(next_keyt, 'ncpus'))
                foundResource = False
                if next_task_id in preallocated_task_count:
                    # if task is pre-allocated, can only continue on the same node
                    r = preallocated_task_resource[next_task_id]
                    # subtract what the task already holds
                    nxpus -= preallocated_task_count[next_task_id]
                    avail_r = avail_resource[r]
                    foundResource = (nxpus.ngpus == 0 and avail_r.ncpus != 0) or \
                                    (nxpus.ngpus != 0 and avail_r.ngpus != 0)
                else:
                    # can the task be launched on any node
                    for r, v in six.iteritems(avail_resource):
                        # cannot launch a new task on a reserved node
                        if reserved[r]:
                            continue
                        # NOTE(review): `v.ncpus >= 0` is always true, so any
                        # CPU-only task matches any unreserved node — confirm
                        # whether `> 0` (or >= nxpus.ncpus) was intended.
                        if ((nxpus.ngpus > 0 and resources[r].ngpus >= nxpus.ngpus and v.ngpus > 0) or
                                (nxpus.ngpus == 0 and v.ncpus >= 0)):
                            foundResource = True
                            break
                if not foundResource:
                    continue
                priority = int(self._redis.hget(next_keyt, 'priority'))
                queued_time = float(self._redis.hget(next_keyt, 'queued_time'))
                # keep the highest priority; ties go to the earliest queued
                if priority > best_task_priority or (
                        priority == best_task_priority and best_task_queued_time > queued_time):
                    best_task_priority = priority
                    best_task_id = next_task_id
                    best_task_queued_time = queued_time
        if best_task_id:
            self._logger.info('selected %s to be launched on %s', best_task_id, service.name)
            task.work_queue(self._redis, best_task_id, service.name)
            self._redis.lrem(queue, 0, best_task_id)
def run(self):
    """Main worker loop.

    Polls redis roughly every 0.01s and, on a cadence driven by two
    counters, (1) refreshes this worker's heartbeat key, (2) processes
    admin commands, (3) consumes keyspace-expiration events for task
    beats and queue delays, (4) advances one task from the work queue,
    and (5) periodically tries to unqueue waiting tasks. Exits the
    process via sys.exit(0) when the worker key disappears or the
    default configuration changes.
    """
    self._logger.info('Starting worker')

    # Subscribe to beat expiration.
    # NOTE(review): '__keyspace@0__' assumes redis keyspace notifications
    # are enabled and the worker uses database 0 — confirm deployment config.
    pubsub = self._redis.pubsub()
    pubsub.psubscribe('__keyspace@0__:beat:*')
    pubsub.psubscribe('__keyspace@0__:queue:*')

    counter = 0
    counter_beat = 1000  # start above threshold so the beat is set on first iteration

    while True:
        counter_beat += 1

        # every 1000 * 0.01s (10s) - check & reset beat of the worker
        if counter_beat > 1000:
            counter_beat = 0
            if self._redis.exists(self._worker_id):
                self._redis.hset(self._worker_id, "beat_time", time.time())
                self._redis.expire(self._worker_id, 1200)
            else:
                # the worker key was removed or expired: another actor asked us to stop
                self._logger.info('stopped by key expiration/removal')
                sys.exit(0)

        # every 100 * 0.01s (1s) - check worker administration command
        if counter_beat % 100 == 0:
            workeradmin.process(self._logger, self._redis, self._service)
            # stop if the 'default' config timestamp changed since startup
            if (self._default_config_timestamp and
                    self._redis.hget('default', 'timestamp') != self._default_config_timestamp):
                self._logger.info(
                    'stopped by default configuration change')
                sys.exit(0)

        # process one message from the queue
        message = pubsub.get_message()
        if message:
            channel = message['channel']
            data = message['data']
            if data == 'expired':
                # task expired, no beat was received
                if channel.startswith('__keyspace@0__:beat:'):
                    task_id = channel[20:]  # strip the '__keyspace@0__:beat:' prefix
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: task expired', task_id)
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis, task_id, phase='expired')
                # expired in the queue - comes back in the work queue
                elif channel.startswith('__keyspace@0__:queue:'):
                    task_id = channel[21:]  # strip the '__keyspace@0__:queue:' prefix
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: move to work queue', task_id)
                        task.work_queue(self._redis, task_id, service)

        # process one element from work queue
        task_id = task.work_unqueue(self._redis, self._service)
        if task_id is not None:
            try:
                self._advance_task(task_id)
            except RuntimeWarning:
                # lock contention is transient: requeue and retry later
                self._logger.warning(
                    '%s: failed to acquire a lock, retrying', task_id)
                task.work_queue(self._redis, task_id, self._service)
            except Exception as e:
                # any other failure terminates the task with its error message logged
                self._logger.error('%s: %s', task_id, str(e))
                with self._redis.acquire_lock(task_id):
                    task.set_log(self._redis, self._taskfile_dir, task_id, str(e))
                    task.terminate(self._redis, task_id, phase="launch_error")

        # every 0.01s * refresh_counter - check if we can find some free resource
        if counter > self._refresh_counter:
            # if there are some queued tasks, look for free resources
            if self._redis.exists('queued:%s' % self._service):
                self._logger.debug('checking processes on : %s', self._service)
                self._service_unqueue(self._services[self._service])
            counter = 0
        counter += 1
        time.sleep(0.01)
def _service_unqueue(self, service):
    """Find the best next task to push to the work queue of `service`.

    Walks the ``queued:<service>`` redis list from the tail, discards
    tasks blocked by an unfinished parent or lacking free gpus/cpus on
    every node, and promotes the remaining task with the highest
    priority (ties broken by earliest queued_time).

    :param service: a service object exposing ``name`` and ``list_resources()``.
    """
    with self._redis.acquire_lock('service:' + service.name):
        queue = 'queued:%s' % service.name
        count = self._redis.llen(queue)
        preallocated_task_count = {}
        preallocated_task_resource = {}
        avail_resource = {}
        resources = service.list_resources()
        reserved = {}
        # list free cpu/gpus on each node
        for resource in resources:
            keyr = 'gpu_resource:%s:%s' % (self._service, resource)
            keyc = 'ncpus:%s:%s' % (self._service, resource)
            available_cpus = int(self._redis.get(keyc))
            current_gpu_usage = 0
            gpu_capacity = resources[resource]
            # count gpus already allocated per task on this node
            for k, v in six.iteritems(self._redis.hgetall(keyr)):
                if v in preallocated_task_count:
                    preallocated_task_count[v] += 1
                else:
                    preallocated_task_count[v] = 1
                    preallocated_task_resource[v] = resource
                current_gpu_usage += 1
            available_gpus = gpu_capacity - current_gpu_usage
            # avail_resource[r] is the tuple (available_cpus, available_gpus)
            avail_resource[resource] = (available_cpus, available_gpus)
            key_reserved = 'reserved:%s:%s' % (service.name, resource)
            reserved[resource] = self._redis.get(key_reserved)
            self._logger.debug(
                "\tresource %s - reserved: %s - free gpus: %d, cpus: %d",
                resource, reserved[resource] or "False",
                available_gpus, available_cpus)

        # Go through the task, find if there are tasks that can be launched and
        # queue the best one
        best_task_id = None
        best_task_priority = -10000
        best_task_queued_time = 0
        while count > 0:
            count -= 1
            next_task_id = self._redis.lindex(queue, count)
            if next_task_id is not None:
                next_keyt = 'task:%s' % next_task_id
                parent = self._redis.hget(next_keyt, 'parent')
                # check parent dependency
                if parent:
                    keyp = 'task:%s' % parent
                    if self._redis.exists(keyp):
                        # if the parent task is in the database, check for dependencies
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status != 'stopped':
                            if parent_status == 'running':
                                # parent is still running so update queued time to be as close
                                # as possible to terminate time of parent task
                                self._redis.hset(next_keyt, "queued_time", time.time())
                            continue
                        else:
                            if self._redis.hget(keyp, 'message') != 'completed':
                                task.terminate(self._redis, next_task_id,
                                               phase='dependency_error')
                                continue
                ngpus = int(self._redis.hget(next_keyt, 'ngpus'))
                ncpus = int(self._redis.hget(next_keyt, 'ncpus'))
                foundResource = False
                if next_task_id in preallocated_task_count:
                    # if task is pre-allocated, can only continue on the same node
                    r = preallocated_task_resource[next_task_id]
                    ngpus -= preallocated_task_count[next_task_id]
                    avail_r = avail_resource[r]
                    # BUGFIX: the CPU requirement was compared against avail_r[1]
                    # (free *gpus*); index 0 of the (cpus, gpus) tuple is the
                    # free-cpu count, matching the `v[0] >= ncpus` check below.
                    foundResource = (ngpus == 0 or avail_r[1] != 0) and \
                                    (ngpus != 0 or ncpus <= avail_r[0])
                else:
                    # can the task be launched on any node
                    for r, v in six.iteritems(avail_resource):
                        # cannot launch a new task on a reserved node
                        if reserved[r]:
                            continue
                        if ((ngpus > 0 and resources[r] >= ngpus and v[1] > 0)
                                or (ngpus == 0 and v[0] >= ncpus)):
                            foundResource = True
                            break
                if not foundResource:
                    continue
                priority = int(self._redis.hget(next_keyt, 'priority'))
                queued_time = float(self._redis.hget(next_keyt, 'queued_time'))
                # keep the highest priority task; on equal priority, the oldest one
                if priority > best_task_priority or (
                        priority == best_task_priority and
                        best_task_queued_time > queued_time):
                    best_task_priority = priority
                    best_task_id = next_task_id
                    best_task_queued_time = queued_time

        if best_task_id:
            self._logger.info('selected %s to be launched on %s',
                              best_task_id, service.name)
            task.work_queue(self._redis, best_task_id, service.name)
            self._redis.lrem(queue, 0, best_task_id)
for key in redis.keys('busy:*'): redis.delete(key) # remove reserved state from resources for key in redis.keys('reserved:*'): redis.delete(key) # On startup, add all active tasks in the work queue. for task_id in task.list_active(redis): with redis.acquire_lock(task_id): status = redis.hget('task:' + task_id, 'status') if status == 'queue' or status == 'allocating' or status == 'allocated': task.service_queue(redis, task_id, redis.hget('task:' + task_id, 'service')) task.set_status(redis, 'task:' + task_id, 'queued') else: task.work_queue(redis, task_id) # Desallocate all resources that are not anymore associated to a running task for service in services: resources = services[service].list_resources() for resource in resources: keyr = 'resource:%s:%s' % (service, resource) running_tasks = redis.hgetall(keyr) for g, task_id in six.iteritems(running_tasks): with redis.acquire_lock(task_id): status = redis.hget('task:' + task_id, 'status') if not (status == 'running' or status == 'terminating'): redis.hdel(keyr, g) # TODO: start multiple workers here? worker = Worker(redis, services, cfg.getint('default', 'refresh_counter'),