def load_machines(self, service_name):
    """Rebuild entity usage and machine availability from the redis hashes.

    Entity usage is re-created with
    ``EntityUsage.initialize_entities_usage`` for ``service_name``.  For each
    machine of ``self._machines`` the ``gpu_resource`` and ``cpu_resource``
    hashes of the resource are read; every entry is registered on the
    machine, counted in the machine usage and added to the usage of the
    entity owning the task.  The machine availability is then set to its
    initial capacity minus that usage.

    ``service`` and ``resource_mgr`` are not defined in this block; they are
    presumably provided by the enclosing scope.

    Returns:
        ``True`` when ``resource_mgr._machines`` is not empty.
    """
    store = self.worker._redis
    self.entities_usage = EntityUsage.initialize_entities_usage(
        self.worker._mongo_client, service_name)

    for resource, machine in six.iteritems(self._machines):
        used = Capacity()
        gpu_key = 'gpu_resource:%s:%s' % (self.worker._service, resource)
        cpu_key = 'cpu_resource:%s:%s' % (self.worker._service, resource)
        gpu_tasks = store.hgetall(gpu_key)
        cpu_tasks = store.hgetall(cpu_key)

        # can not launch multiple tasks on service with no multi-tasking (ec2)
        # or launch multiple tasks on service with hybrid task mode and
        # dynamic resource mode (nova): such a resource is skipped as soon as
        # it already hosts a task.
        if not _is_resource_multitask(service, resource) and (gpu_tasks or cpu_tasks):
            continue

        # owner entity of each task, looked up only once per task id
        owners = {}
        per_kind = (
            (gpu_tasks, used.incr_ngpus, {'ngpus': 1}),
            (cpu_tasks, used.incr_ncpus, {'ncpus': 1}),
        )
        for slots, count_one, unit in per_kind:
            for task_id in six.itervalues(slots):
                if task_id in owners:
                    owner = owners[task_id]
                else:
                    owner = task.get_owner_entity(store, task_id)
                    owners[task_id] = owner
                self.preallocated_task_resource.setdefault(task_id, resource)
                self._machines[resource].add_task(task_id, store)
                count_one(1)
                self.entities_usage[owner].add_current_usage(Capacity(**unit))

        free = machine._init_capacity - used
        self._machines[resource].set_available(free)
        self.worker._logger.debug("\tresource %s: - free %s", resource, free)

    return len(resource_mgr._machines) > 0
def __init__(self, config):
    """Build the template, resource and machine tables from the config.

    Each entry of ``config['variables']['template_pool']`` is completed in
    place with the nova flavor ``id``/``name`` and with ``gpus``/``cpus``
    ranges taken from ``ovh_capacity_map``.  One resource named
    ``"<template name>:<idx>"`` is declared per allowed instance
    (``maxInstances``, default 1).

    Raises:
        ValueError: when a template name is not a key of ``ovh_capacity_map``.
        novaclient.exceptions.NotFound: propagated unchanged when the flavor
            lookup fails.
    """
    super().__init__(config)
    self._nova_client = init_nova_client(config)
    self._templates = []
    self._resources = {}
    self._machines = {}
    template_pool = config['variables']['template_pool']
    for template in template_pool:
        instance_type = template['name']
        if instance_type not in ovh_capacity_map:
            raise ValueError('unknown instance type: %s' % instance_type)
        xpu = ovh_capacity_map[instance_type]
        # A failed lookup simply propagates: the former handler only caught
        # the exception to re-raise it as is.
        flavor = self._nova_client.flavors.find(name=instance_type)
        template["id"] = flavor.id
        template["name"] = flavor.name
        template["gpus"] = range(xpu.ngpus)
        template["cpus"] = range(xpu.ncpus)
        maxInstances = template.get("maxInstances", 1)
        self._templates.append(template)
        for idx in range(maxInstances):
            resource_name = "%s:%d" % (template["name"], idx)
            self._resources[resource_name] = Capacity(len(template["gpus"]),
                                                      len(template["cpus"]))
            # every instance of a template shares the same template dict
            self._machines[resource_name] = template
    logger.info("Initialized OVH instance - found %d templates.",
                len(template_pool))
def _handle_terminating_task(self, task_id):
    """Terminate the job of ``task_id``, release its resource and stop it.

    When the task hash holds a ``job`` entry, the service is asked to
    terminate it (a failure is logged, not raised).  The allocated resource,
    if any, is released and the task is finally set to ``stopped`` and
    disabled.
    """
    keyt = 'task:%s' % task_id
    _, service = self._get_service(keyt=keyt)
    job = self._redis.hget(keyt, 'job')
    nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                     self._redis.hget(keyt, 'ncpus'))

    if job is None:
        # no job was ever recorded for this task
        self._logger.info('%s: terminating task (on error)', task_id)
    else:
        container_id = self._redis.hget(keyt, 'container_id')
        job = json.loads(job)
        job['container_id'] = container_id
        self._logger.info('%s: terminating task (job: %s)', task_id, json.dumps(job))
        try:
            service.terminate(job)
            self._logger.info('%s: terminated', task_id)
        except Exception:
            # best effort: the task is stopped even if the service failed
            self._logger.warning('%s: failed to terminate', task_id)
            self._logger.info(traceback.format_exc())

    allocated = self._redis.hget(keyt, 'alloc_resource')
    if allocated:
        self._release_resource(service, allocated, task_id, nxpus)
    task.set_status(self._redis, keyt, 'stopped')
    task.disable(self._redis, task_id)
def __init__(self, service, name, initial_capacity, logger):
    """Store the machine identity; start with no task registered.

    The available capacity starts as a default-constructed ``Capacity``.
    """
    self._service = service
    self._name = name
    self._logger = logger
    self._init_capacity = initial_capacity
    self._available_cap = Capacity()
    self._tasks = {}
def list_resources(self):
    """Return a dict mapping each machine name to its ``Capacity``.

    The capacity is built from the number of entries in the ``gpus`` and
    ``cpus`` fields of the machine description.
    """
    capacities = {}
    for server, description in self._machines.items():
        capacities[server] = Capacity(len(description['gpus']),
                                      len(description['cpus']))
    return capacities
def total_capacity(self):
    """Total capacity of the service, i.e. the sum of the capacities
    returned by ``list_resources()``.
    """
    total = Capacity()
    resources = self.list_resources()
    for resource_capacity in six.itervalues(resources):
        total += resource_capacity
    return total
def _usagecapacity(service):
    """Calculate the current usage of the service.

    Reads, for every resource of ``service``, the ``reserved``, ``busy``,
    ``gpu_resource`` and ``cpu_resource`` redis entries.

    Returns:
        A tuple ``(usage, queued, capacity, busy, detail)`` where ``usage``
        and ``capacity`` are strings formatted as ``"<gpus> (<cpus>)"``,
        ``queued`` is the length of the ``queued:<service name>`` list,
        ``busy`` is the number of resources flagged busy and ``detail`` maps
        each resource name to a dict with the keys ``busy``, ``reserved``,
        ``capacity``, ``usage``, ``avail_gpus`` and ``avail_cpus``.
    """
    usage_xpu = Capacity()
    capacity_xpus = Capacity()
    busy = 0
    detail = {}
    # Query the resource table once: it was previously rebuilt for every
    # single resource inside the loop.
    resources = service.list_resources()
    for resource in resources:
        r_capacity = resources[resource]
        info = {'busy': '', 'reserved': ''}
        detail[resource] = info
        info['capacity'] = r_capacity
        capacity_xpus += r_capacity

        reserved = redis.get("reserved:%s:%s" % (service.name, resource))
        if reserved:
            info['reserved'] = reserved

        count_map_gpu = Counter()
        count_map_cpu = Counter()
        task_type = {}  # task id -> task type, fetched once per task
        count_used_xpus = Capacity()

        r_usage_gpu = redis.hgetall("gpu_resource:%s:%s" % (service.name, resource)).values()
        for t in r_usage_gpu:
            if t not in task_type:
                task_type[t] = redis.hget("task:%s" % t, "type")
            count_map_gpu[t] += 1
            count_used_xpus.incr_ngpus(1)

        r_usage_cpu = redis.hgetall("cpu_resource:%s:%s" % (service.name, resource)).values()
        for t in r_usage_cpu:
            if t not in task_type:
                task_type[t] = redis.hget("task:%s" % t, "type")
            count_map_cpu[t] += 1
            count_used_xpus.incr_ncpus(1)

        # one line per task: "<type> <task id>: <gpu slots> (<cpu slots>)"
        info['usage'] = [
            "%s %s: %d (%d)" % (task_type[t], t, count_map_gpu[t], count_map_cpu[t])
            for t in task_type
        ]
        info['avail_gpus'] = r_capacity.ngpus - count_used_xpus.ngpus
        info['avail_cpus'] = r_capacity.ncpus - count_used_xpus.ncpus

        err = redis.get("busy:%s:%s" % (service.name, resource))
        if err:
            info['busy'] = err
            busy += 1
        usage_xpu += count_used_xpus

    queued = redis.llen("queued:" + service.name)
    return ("%d (%d)" % (usage_xpu.ngpus, usage_xpu.ncpus),
            queued,
            "%d (%d)" % (capacity_xpus.ngpus, capacity_xpus.ncpus),
            busy,
            detail)
def __init__(self, service, name, initial_capacity, logger, priority=None):
    """Store the machine identity and priority; start with no task.

    A falsy ``priority`` (``None``, ``0``) is replaced by ``1``.  The
    available capacity starts as a default-constructed ``Capacity``.
    """
    self._service = service
    self._name = name
    self._logger = logger
    self._priority = priority or 1
    self._init_capacity = initial_capacity
    self._available_cap = Capacity()
    self._tasks = {}
def _allocate_resource(self, task_id, request_resource, service, nxpus):
    """Allocate a resource for ``task_id``.

    Every resource of the service compatible with ``request_resource`` is
    tried in turn with ``_reserve_resource``; each successful reservation
    replaces (and releases) the previously kept one.

    Returns:
        ``(resource name, allocated capacity)``; the name is ``None`` when
        no resource was allocated.
    """
    chosen = None
    chosen_available = Capacity()
    chosen_remaining = Capacity(-1, -1)

    for name, capacity in six.iteritems(service.list_resources()):
        if not _compatible_resource(name, request_resource):
            continue
        available, remaining = self._reserve_resource(
            service, name, capacity, task_id, nxpus,
            chosen_available, chosen_remaining)
        if available is False:
            continue
        # this resource is preferred: give back the one reserved before
        if chosen is not None:
            self._release_resource(service, chosen, task_id, nxpus)
        chosen = name
        chosen_remaining = remaining
        chosen_available = available

    return chosen, chosen_available
def __init__(self, task_infos, must_patch_config_name=False):
    """Initialise the task from ``task_infos``.

    Copies the request content, derives the language pair, keeps the service
    settings and files, builds the task id (when ``self._task_suffix`` is
    set) and selects the resource matching the requested gpus/cpus.

    ``self._parent_task_id`` and ``self._task_suffix`` are read here but not
    assigned in this block; they are presumably provided by the class or a
    subclass.

    Args:
        task_infos: object exposing ``content``, ``request_data``,
            ``service``, ``routes_configuration``, ``files``,
            ``other_infos`` and ``resource``.
        must_patch_config_name: when true, the explicit name returned by
            ``build_task_id`` is patched into the content.
    """
    self._content = deepcopy(task_infos.content)

    request = task_infos.request_data
    self._lang_pair = f'{request["source"]}{request["target"]}'
    if not self._lang_pair and self._parent_task_id:
        # fall back on the second "_"-separated field of the parent id
        self._lang_pair = self._parent_task_id.split("_")[1]

    routes = task_infos.routes_configuration
    self._service = task_infos.service
    self._service_config = routes.service_config
    self._service_module = routes.service_module
    self._files = task_infos.files
    self.other_task_info = {
        TaskEnum.ENTITY_OWNER.value: routes.entity_owner,
        TaskEnum.STORAGE_ENTITIES.value: json.dumps(routes.trainer_entities),
    }
    if task_infos.other_infos:
        self.update_other_infos(task_infos.other_infos)

    self._priority = self._content.get("priority", 0)
    self._resource = task_infos.resource

    if self._task_suffix:
        self.task_id, explicit_name = build_task_id(
            self._content, self._lang_pair, self._task_suffix, self._parent_task_id)
        if must_patch_config_name:
            TaskBase.patch_config_explicit_name(self._content, explicit_name)

    # an explicit resource wins over the one named in the options
    if self._resource:
        wanted = self._resource
    else:
        wanted = self._service_module.get_resource_from_options(
            self._content["options"])
    self._resource = self._service_module.select_resource_from_capacity(
        wanted, Capacity(self._content["ngpus"], self._content["ncpus"]))
def try_create(next_task_id):
    """Return a ``CandidateTask`` for ``next_task_id`` or ``None``.

    ``None`` is returned when the owner entity has no usage entry, when the
    parent task prevents the launch, or when no machine is both authorized
    for the task and large enough for it.  ``self`` and ``resource_mgr`` are
    taken from the enclosing scope.
    """
    next_keyt = 'task:%s' % next_task_id
    parent = self._redis.hget(next_keyt, 'parent')
    task_entity = task.get_owner_entity(self._redis, next_task_id)

    if task_entity not in resource_mgr.entities_usage:
        self._logger.error(
            "\t[Task %s] entity %s - without usage limit !",
            next_task_id, task_entity)
        return None

    # check parent dependency (only when the parent is in the database)
    if parent:
        keyp = 'task:%s' % parent
        if self._redis.exists(keyp):
            parent_status = self._redis.hget(keyp, 'status')
            if parent_status == 'running':
                # parent is still running so update queued time to be as
                # close as possible to terminate time of parent task
                self._redis.hset(next_keyt, "queued_time", time.time())
                return None
            if parent_status != 'stopped':
                return None
            if self._redis.hget(keyp, 'message') != 'completed':
                task.terminate(self._redis, next_task_id, phase='dependency_error')
                return None

    requested = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                         self._redis.hget(next_keyt, 'ncpus'))
    candidate = CandidateTask(
        next_task_id, task_entity, self._redis, requested,
        resource_mgr.entities_usage[task_entity], self._logger)

    # check now the task has a chance to be processed by any machine
    for machine in six.itervalues(resource_mgr._machines):
        if (machine._is_authorized(candidate._entity, candidate._capacity)
                and candidate._capacity.inf_or_eq(machine._init_capacity)):
            return candidate
    return None
def _distribute_machine_for_task(self, task_id, task_entity, task_expected_capacity, request_resource, service, machines):
    """Reserve a machine for ``task_id`` and return its name.

    Only machines compatible with ``request_resource`` and authorized for
    the entity/capacity are tried.  Each successful ``_reserve_resource``
    call replaces the machine kept so far, whose reservation is released.

    Returns:
        The name of the reserved machine, or ``None``.
    """
    chosen = None
    chosen_remaining = Capacity(-1, -1)

    for name, machine in six.iteritems(machines):
        if not _compatible_resource(name, request_resource):
            continue
        if not machine._is_authorized(task_entity, task_expected_capacity):
            continue
        remaining = self._reserve_resource(
            service, name, machine._init_capacity, task_id,
            task_expected_capacity, chosen_remaining)
        if remaining is None:
            continue
        # a better machine was reserved: give back the previous one
        if chosen is not None:
            self._release_resource(service, chosen, task_id,
                                   task_expected_capacity)
        chosen = name
        chosen_remaining = remaining

    return chosen
def __init__(self, config):
    """Build the template, resource and machine tables from the config.

    For each entry of ``config['variables']['template_pool']`` the default
    version of the EC2 launch template is fetched, and the entry is
    completed in place with the template ``id``/``name`` and with
    ``gpus``/``cpus`` ranges taken from ``ec2_capacity_map``.  One resource
    named ``"<template name>:<idx>"`` is declared per allowed instance
    (``maxInstances``, default 1).

    Raises:
        ValueError: when the launch template cannot be retrieved, has no
            ``LaunchTemplateData``, or when its instance type is missing or
            is not a key of ``ec2_capacity_map``.
    """
    super().__init__(config)
    variables = config["variables"]
    self._session = boto3.Session(
        aws_access_key_id=variables["awsAccessKeyId"],
        aws_secret_access_key=variables["awsSecretAccessKey"],
        region_name=variables["awsRegion"])
    ec2_client = self._session.client("ec2")
    self._templates = []
    self._resources = {}
    self._machines = {}
    for template in config['variables']['template_pool']:
        response = ec2_client.describe_launch_template_versions(
            DryRun=False,
            LaunchTemplateName=template['name'],
            Filters=[{
                'Name': 'is-default-version',
                'Values': ["true"]
            }])
        if not response or not response["LaunchTemplateVersions"]:
            raise ValueError('cannot retrieve launch template')
        template_description = response["LaunchTemplateVersions"][0]
        if "LaunchTemplateData" not in template_description:
            raise ValueError('invalid template_description')
        launch_template_data = template_description["LaunchTemplateData"]
        # ``.get`` keeps the error a ValueError when the key is missing:
        # indexing it while formatting the message raised a KeyError.
        instance_type = launch_template_data.get("InstanceType")
        if instance_type not in ec2_capacity_map:
            raise ValueError('unknown instance type: %s' % instance_type)
        xpu = ec2_capacity_map[instance_type]
        maxInstances = template.get("maxInstances", 1)
        template["id"] = template_description["LaunchTemplateId"]
        template["name"] = template_description["LaunchTemplateName"]
        template["gpus"] = range(xpu.ngpus)
        template["cpus"] = range(xpu.ncpus)
        self._templates.append(template)
        for idx in range(maxInstances):
            resource_name = "%s:%d" % (template["name"], idx)
            self._resources[resource_name] = Capacity(len(template["gpus"]),
                                                      len(template["cpus"]))
            # every instance of a template shares the same template dict
            self._machines[resource_name] = template
    logger.info("Initialized EC2 - found %d templates.",
                len(config['variables']['template_pool']))
def _reserve_resource(self, service, resource, capacity, task_id, nxpus,
                      br_available_xpus, br_remaining_xpus, check_reserved=False):
    """Reserve slots of ``resource`` for ``task_id``, if possible.

    The resource is locked (``acquire_lock`` on its gpu hash key) while the
    reservation is attempted.  The reservation only happens when this
    resource is a better choice than the best one found so far, described by
    ``br_available_xpus`` / ``br_remaining_xpus``: more slots can be
    allocated, or as many with a preferable number of remaining slots.

    Args:
        service: service owning the resource (``name`` and
            ``resource_multitask`` are read).
        resource: name of the resource.
        capacity: capacity of the resource.
        task_id: id stored in the reserved slots.
        nxpus: requested capacity.
        br_available_xpus: capacity allocated on the best resource so far.
        br_remaining_xpus: capacity left on the best resource so far.
        check_reserved: not used by this implementation.

    Returns:
        ``(False, False)`` when nothing was reserved, otherwise
        ``(Capacity(allocated gpus, allocated cpus),
        Capacity(remaining gpus, remaining cpus))``.
    """
    # the resource must be at least as large as the request
    for idx, val in enumerate(capacity):
        if val < nxpus[idx]:
            return False, False

    keygr = 'gpu_resource:%s:%s' % (service.name, resource)
    keycr = 'cpu_resource:%s:%s' % (service.name, resource)
    key_busy = 'busy:%s:%s' % (service.name, resource)
    key_reserved = 'reserved:%s:%s' % (service.name, resource)

    with self._redis.acquire_lock(keygr):
        if self._redis.get(key_busy) is not None:
            return False, False

        allocated_gpu = 0
        allocated_cpu = 0
        remaining_gpus = 0
        remaining_cpus = 0

        # allocate GPU first. For GPU we want to minimise the fragmentation,
        # so minimize br_remaining_xpus.ngpus
        if nxpus.ngpus != 0:
            # do not allocate several run on the same GPU
            current_usage_gpu = self._redis.hlen(keygr)
            if current_usage_gpu > 0 and not service.resource_multitask:
                return False, False
            # available gpu is the capacity of the node less number of gpu used
            avail_gpu = capacity.ngpus - current_usage_gpu
            allocated_gpu = min(avail_gpu, nxpus.ngpus)
            remaining_gpus = avail_gpu - allocated_gpu
            if (allocated_gpu > 0 and
                    ((allocated_gpu > br_available_xpus.ngpus) or
                     (allocated_gpu == br_available_xpus.ngpus and
                      remaining_gpus < br_remaining_xpus.ngpus))):
                # GPU slots are numbered from 1; take the first free ones.
                # ``range`` replaces ``xrange``, which is not a builtin on
                # Python 3.
                idx = 1
                for _ in range(allocated_gpu):
                    while self._redis.hget(keygr, str(idx)) is not None:
                        idx += 1
                        assert idx <= capacity.ngpus, "invalid gpu alloc for %s" % keygr
                    self._redis.hset(keygr, str(idx), task_id)
            else:
                return False, False

        # if we don't need to allocate GPUs anymore, start allocating CPUs
        # * for CPU on multitask service we want to maximize the remaining CPU
        #   to avoid loading too much individual servers
        # * for CPU on monotask service, we want to minimize the remaining CPU
        #   to avoid loading on a over-dimensioned service
        if allocated_gpu == nxpus.ngpus and nxpus.ncpus != 0:
            current_usage_cpu = self._redis.hlen(keycr)
            if current_usage_cpu > 0 and not service.resource_multitask:
                return False, False
            avail_cpu = capacity.ncpus - current_usage_cpu
            allocated_cpu = min(avail_cpu, nxpus.ncpus)
            remaining_cpus = avail_cpu - allocated_cpu

            # for mono task service, allocate node with lowest cpu number
            if service.resource_multitask:
                better_cpu_usage = remaining_cpus > br_remaining_xpus.ncpus
            else:
                better_cpu_usage = remaining_cpus < br_remaining_xpus.ncpus

            if (allocated_cpu > 0 and
                    (allocated_gpu != 0 or
                     (allocated_cpu > br_available_xpus.ncpus) or
                     (allocated_cpu == br_available_xpus.ncpus and better_cpu_usage))):
                # CPU slots are numbered from 0; take the first free ones
                idx = 0
                for _ in range(allocated_cpu):
                    while self._redis.hget(keycr, str(idx)) is not None:
                        idx += 1
                        assert idx <= capacity.ncpus, "invalid cpu alloc for %s" % keycr
                    self._redis.hset(keycr, str(idx), task_id)
            else:
                return False, False

        # partial allocation: flag the resource as reserved for this task
        if allocated_gpu < nxpus.ngpus or allocated_cpu < nxpus.ncpus:
            self._redis.set(key_reserved, task_id)

        return (Capacity(allocated_gpu, allocated_cpu),
                Capacity(remaining_gpus, remaining_cpus))
def __init__(self, current_usage, entity_name, usage_coeff):
    """Record the entity name, its usage coefficient and its current usage.

    A falsy ``current_usage`` is replaced by a default-constructed
    ``Capacity``.
    """
    self._entity = entity_name
    if current_usage:
        self._current_usage_capacity = current_usage
    else:
        self._current_usage_capacity = Capacity()
    self._usage_coeff = usage_coeff
def add_task(self, task_id, redis):
    """Register ``task_id`` on this machine with its requested capacity.

    The capacity is read from the ``ngpus`` and ``ncpus`` fields of the
    ``task:<task_id>`` redis hash.  A task that is already registered is
    left untouched and redis is not queried.

    Args:
        task_id: id of the task to register.
        redis: client used to read the task hash.
    """
    # The membership test must use ``task_id``: it previously tested the
    # unrelated name ``task``, so the guard never matched the task added.
    if task_id in self._tasks:
        return
    redis_key = 'task:%s' % task_id
    self._tasks[task_id] = Capacity(redis.hget(redis_key, 'ngpus'),
                                    redis.hget(redis_key, 'ncpus'))
def _reserve_resource(self, service, resource, capacity, task_id, task_asked_capacity, br_remaining_xpus, br_priority, resource_priority):
    """Reserve the resource for task_id, if possible.

    The resource is locked (``acquire_lock`` on its gpu hash key) while the
    reservation is attempted.  The reservation is refused when the resource
    is busy, too small, already used on a non multi-task resource, or when
    it is not a better choice than the best resource found so far
    (``br_remaining_xpus`` / ``br_priority``).  A value of ``-1`` in
    ``br_remaining_xpus`` disables the corresponding comparison.

    Args:
        service: service owning the resource (``name`` is read).
        resource: name of the resource.
        capacity: capacity of the resource.
        task_id: id stored in the reserved slots.
        task_asked_capacity: capacity requested by the task.
        br_remaining_xpus: capacity left on the best resource so far.
        br_priority: priority of the best resource so far.
        resource_priority: priority of this resource.

    Returns:
        ``(None, None)`` when nothing was reserved, otherwise
        ``(Capacity(remaining gpus, remaining cpus), resource_priority)``.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-mangled
    # source; confirm it against the reference file.
    self._logger.debug('service.name = %s', service.name)
    self._logger.debug('resource = %s', resource)
    self._logger.debug('capacity = (%d, %d)', capacity.ngpus, capacity.ncpus)
    self._logger.debug('task_id = %s', task_id)
    self._logger.debug('nxpus = (%d, %d)', task_asked_capacity.ngpus, task_asked_capacity.ncpus)
    self._logger.debug('br_remaining_xpus = (%d, %d)', br_remaining_xpus.ngpus, br_remaining_xpus.ncpus)
    # the resource must be at least as large as the request
    for idx, val in enumerate(capacity):
        if val < task_asked_capacity[idx]:
            return None, None
    keygr = 'gpu_resource:%s:%s' % (service.name, resource)
    keycr = 'cpu_resource:%s:%s' % (service.name, resource)
    key_busy = 'busy:%s:%s' % (service.name, resource)
    with self._redis.acquire_lock(keygr):
        if self._redis.get(key_busy) is not None:
            return None, None
        remaining_gpus = 0
        remaining_cpus = 0
        # CPU availability is checked first, whatever the request is
        current_usage_cpu = self._redis.hlen(keycr)
        self._logger.debug('current_usage_cpu = %d', current_usage_cpu)
        if current_usage_cpu > 0 and not _is_resource_multitask(
                service, resource):
            return None, None
        avail_cpu = capacity.ncpus - current_usage_cpu
        if task_asked_capacity.ncpus > avail_cpu:
            return None, None
        # For GPU we want to minimise the fragmentation, so minimize
        # br_remaining_xpus.ngpus
        if task_asked_capacity.ngpus != 0:
            # do not allocate several run on the same GPU
            current_usage_gpu = self._redis.hlen(keygr)
            self._logger.debug('current_usage_gpu = %d', current_usage_gpu)
            if current_usage_gpu > 0 and not _is_resource_multitask(
                    service, resource):
                return None, None
            # available gpu is the capacity of the node less number of gpu used
            avail_gpu = capacity.ngpus - current_usage_gpu
            self._logger.debug('avail_gpu = %d', avail_gpu)
            if task_asked_capacity.ngpus > avail_gpu:
                return None, None
            remaining_gpus = avail_gpu - task_asked_capacity.ngpus
            self._logger.debug('remaining_gpus = %d', remaining_gpus)
            # at equal priority, refuse a resource that would leave at least
            # as many free GPUs as the best one found so far
            if br_remaining_xpus.ngpus != -1 and remaining_gpus >= br_remaining_xpus.ngpus and \
                    resource_priority == br_priority:
                return None, None
        remaining_cpus = avail_cpu - task_asked_capacity.ncpus
        self._logger.debug('remaining_cpus = %d', remaining_cpus)
        # allocate node with higher resource priority
        # if priority for resources is equal: on a multi-task resource prefer
        # the node leaving the most CPUs, on a mono-task resource prefer the
        # node leaving the fewest CPUs
        if resource_priority != br_priority:
            better_cpu_usage = resource_priority > br_priority
        elif _is_resource_multitask(service, resource):
            better_cpu_usage = remaining_cpus > br_remaining_xpus.ncpus
        else:
            better_cpu_usage = remaining_cpus < br_remaining_xpus.ncpus
        if br_remaining_xpus.ncpus != -1 and not better_cpu_usage:
            return None, None
        # GPU slots are numbered from 1: take the first free ones
        idx = 1
        for _ in range(task_asked_capacity.ngpus):
            while self._redis.hget(keygr, str(idx)) is not None:
                idx += 1
                assert idx <= capacity.ngpus, "invalid gpu alloc for %s" % keygr
            self._logger.debug('reserve GPU idx = %d', idx)
            self._redis.hset(keygr, str(idx), task_id)
        # CPU slots are numbered from 0: take the first free ones
        cpu_idx = 0
        for _ in range(task_asked_capacity.ncpus):
            while self._redis.hget(keycr, str(cpu_idx)) is not None:
                cpu_idx += 1
                assert cpu_idx + 1 <= capacity.ncpus, "invalid cpu alloc for %s" % keycr
            self._logger.debug('reserve CPU idx = %d', cpu_idx)
            self._redis.hset(keycr, str(cpu_idx), task_id)
        return Capacity(remaining_gpus, remaining_cpus), resource_priority
def _handle_allocated_task(self, task_id):
    """Launch ``task_id`` on the resource that was allocated to it.

    The GPU/CPU slots holding the task id are collected from the resource
    hashes, stored on the task (``alloc_lgpu`` / ``alloc_lcpu``) and passed
    to ``service.launch``.

    * On ``EnvironmentError`` the resource is blocked and released, and the
      task is queued again (unless it is ``terminating``).
    * On any other exception the error is logged, appended to the task log,
      the ``task/terminate`` callback is called with the ``launch_error``
      phase and the task is terminated.
    * On success the job is stored, the task is set to ``running`` (unless
      it is ``terminating``) and it is pushed on the work queue.

    Returns:
        Always ``None``.

    Raises:
        RuntimeError: when the ``task/terminate`` callback does not answer
            with status 200.
    """
    # URLs always use "/": join them with posixpath, not the platform
    # dependent os.path.
    import posixpath

    keyt = 'task:%s' % task_id
    _, service = self._get_service(keyt=keyt)
    content = json.loads(self._redis.hget(keyt, 'content'))
    resource = self._redis.hget(keyt, 'alloc_resource')
    self._logger.info('%s: launching on %s', task_id, service.name)
    try:
        entity_config = self._get_current_config(task_id)

        # slots of the resource currently holding this task
        keygr = 'gpu_resource:%s:%s' % (service.name, resource)
        lgpu = [k for k, v in six.iteritems(self._redis.hgetall(keygr))
                if v == task_id]
        self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))

        keycr = 'cpu_resource:%s:%s' % (service.name, resource)
        lcpu = [k for k, v in six.iteritems(self._redis.hgetall(keycr))
                if v == task_id]
        self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))

        data = service.launch(
            task_id,
            content['options'],
            (lgpu, lcpu),
            resource,
            entity_config["storages"],
            entity_config["docker"],
            content['docker']['registry'],
            content['docker']['image'],
            content['docker']['tag'],
            content['docker']['command'],
            task.file_list(self._taskfile_dir, task_id),
            content['wait_after_launch'],
            self._redis.hget(keyt, 'token'),
            content.get('support_statistics'))
    except EnvironmentError as e:
        # the resource is not available and will be set busy
        self._block_resource(resource, service, str(e))
        self._redis.hdel(keyt, 'alloc_resource')
        # set the task as queued again
        self._release_resource(
            service, resource, task_id,
            Capacity(self._redis.hget(keyt, 'ngpus'),
                     self._redis.hget(keyt, 'ncpus')))
        status = self._redis.hget(keyt, 'status')
        if status == 'terminating':
            return None
        task.set_status(self._redis, keyt, 'queued')
        task.service_queue(self._redis, task_id, service.name)
        self._logger.info(
            'could not launch [%s] %s on %s: blocking resource',
            str(e), task_id, resource)
        self._logger.info(traceback.format_exc())
        return None
    except Exception as e:
        # all other errors make the task fail
        self._logger.info('fail task [%s] - %s', task_id, str(e))
        self._logger.info(traceback.format_exc())
        task.append_log(self._taskfile_dir, task_id, str(e))
        auth_token = self._redis.hget(keyt, 'token')
        callback_url = service._config.get('callback_url')
        if auth_token:
            # the token is sent as the user name of the URL credentials
            callback_url = callback_url.replace("://", "://" + auth_token + ":x@")
        r = requests.get(posixpath.join(callback_url, "task/terminate", task_id),
                         params={'phase': 'launch_error'})
        if r.status_code != 200:
            raise RuntimeError(
                'incorrect result from \'task/terminate\' service: %s' % r.text) from e
        task.terminate(self._redis, task_id, phase='launch_error')
        self._logger.info(traceback.format_exc())
        return None

    self._logger.info('%s: task started on %s', task_id, service.name)
    self._redis.hset(keyt, 'job', json.dumps(data))
    status = self._redis.hget(keyt, 'status')
    if status == 'terminating':
        return None
    task.set_status(self._redis, keyt, 'running')
    # For services that do not notify their activity, we should
    # poll the task status more regularly.
    task.work_queue(self._redis, task_id, service.name,
                    delay=120 if service.is_notifying_activity else 30)
    return None
def launch(service): pool_entity = service[0:2].upper() if not has_ability(flask.g, "train", pool_entity): abort(make_response(jsonify(message="insufficient credentials for train " "(entity %s)" % pool_entity), 403)) current_configuration_name = redis.hget("admin:service:%s" % service, "current_configuration") configurations = json.loads(redis.hget("admin:service:%s" % service, "configurations")) current_configuration = json.loads(configurations[current_configuration_name][1]) content = flask.request.form.get('content') if content is not None: content = json.loads(content) else: abort(flask.make_response(flask.jsonify(message="missing content in request"), 400)) files = {} for k in flask.request.files: files[k] = flask.request.files[k].read() service_module = get_service(service) content["service"] = service exec_mode = content.get('exec_mode', False) if not exec_mode: task_type = '????' if "train" in content["docker"]["command"]: task_type = "train" elif "trans" in content["docker"]["command"]: task_type = "trans" elif "preprocess" in content["docker"]["command"]: task_type = "prepr" elif "release" in content["docker"]["command"]: task_type = "relea" elif "buildvocab" in content["docker"]["command"]: task_type = "vocab" else: task_type = 'exec' if task_type == '????': abort(flask.make_response(flask.jsonify(message="incorrect task definition"), 400)) elif task_type != "exec": task_suffix = task_type else: task_suffix = get_docker_action(content["docker"]["command"]) if task_suffix is None: task_suffix = task_type # Sanity check on content. 
if 'options' not in content or not isinstance(content['options'], dict): abort(flask.make_response(flask.jsonify(message="invalid options field"), 400)) if 'docker' not in content: abort(flask.make_response(flask.jsonify(message="missing docker field"), 400)) if ('image' not in content['docker'] or 'registry' not in content['docker'] or 'tag' not in content['docker'] or 'command' not in content['docker']): abort(flask.make_response(flask.jsonify(message="incomplete docker field"), 400)) if content['docker']['registry'] == 'auto': content['docker']['registry'] = _get_registry(service_module, content['docker']['image']) elif content['docker']['registry'] not in service_module._config['docker']['registries']: abort(flask.make_response(flask.jsonify(message="unknown docker registry"), 400)) resource = service_module.get_resource_from_options(content["options"]) iterations = 1 if "iterations" in content: iterations = content["iterations"] if exec_mode: abort(flask.make_response(flask.jsonify(message="chain mode unavailable in exec mode"), 400)) if (task_type != "train" and iterations != 1) or iterations < 1: abort(flask.make_response(flask.jsonify(message="invalid value for iterations"), 400)) ngpus = 1 if "ngpus" in content: ngpus = content["ngpus"] ncpus = content.get("ncpus") # check that we have a resource able to run such a request if not _find_compatible_resource(service_module, ngpus, ncpus, resource): abort(flask.make_response( flask.jsonify(message="no resource available on %s for %d gpus (%s cpus)" % (service, ngpus, ncpus and str(ncpus) or "-")), 400)) if "totranslate" in content: if exec_mode: abort(flask.make_response(flask.jsonify(message="translate mode unavailable for exec cmd"), 400)) totranslate = content["totranslate"] del content["totranslate"] else: totranslate = None if "toscore" in content: if exec_mode: abort(flask.make_response(flask.jsonify(message="score mode unavailable for exec cmd"), 400)) toscore = content["toscore"] del content["toscore"] 
else: toscore = None if "totuminer" in content: if exec_mode: abort(flask.make_response(flask.jsonify(message="tuminer chain mode unavailable for exec cmd"), 400)) totuminer = content["totuminer"] del content["totuminer"] else: totuminer = None docker_version = content['docker']['tag'] if docker_version.startswith('v'): docker_version = docker_version[1:] try: chain_prepr_train = (not exec_mode and not content.get("nochainprepr", False) and task_type == "train" and semver.match(docker_version, ">=1.4.0")) can_trans_as_release = semver.match(docker_version, ">=1.8.0") trans_as_release = (not exec_mode and not content.get("notransasrelease", False) and semver.match(docker_version, ">=1.8.0")) content["support_statistics"] = semver.match(docker_version, ">=1.17.0") except ValueError as err: # could not match docker_version - not valid semver chain_prepr_train = False trans_as_release = False priority = content.get("priority", 0) (xxyy, parent_task_id) = shallow_command_analysis(content["docker"]["command"]) parent_struct = None parent_task_type = None if not exec_mode and parent_task_id: (parent_struct, parent_task_type) = model_name_analysis(parent_task_id) # check that parent model type matches current command if parent_task_type: if (parent_task_type == "trans" or parent_task_type == "relea" or (task_type == "prepr" and parent_task_type != "train" and parent_task_type != "vocab")): abort(flask.make_response(flask.jsonify(message="invalid parent task type: %s" % (parent_task_type)), 400)) task_ids = [] task_create = [] while iterations > 0: if (chain_prepr_train and parent_task_type != "prepr") or task_type == "prepr": prepr_task_id, explicitname = build_task_id(content, xxyy, "prepr", parent_task_id) if explicitname: patch_config_explicitname(content, explicitname) idx = 0 prepr_command = [] train_command = content["docker"]["command"] while train_command[idx] != 'train' and train_command[idx] != 'preprocess': prepr_command.append(train_command[idx]) idx += 1 # 
create preprocess command, don't push the model on the catalog, # and generate a pseudo model prepr_command.append("--no_push") prepr_command.append("preprocess") prepr_command.append("--build_model") content["docker"]["command"] = prepr_command content["ncpus"] = ncpus or \ get_cpu_count(current_configuration, 0, "preprocess") content["ngpus"] = 0 preprocess_resource = service_module.select_resource_from_capacity( resource, Capacity(content["ngpus"], content["ncpus"])) # launch preprocess task on cpus only task_create.append( (redis, taskfile_dir, prepr_task_id, "prepr", parent_task_id, preprocess_resource, service, _duplicate_adapt(service_module, content), files, priority, 0, content["ncpus"], {})) task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ("prepr", prepr_task_id, 0, content["ncpus"])) remove_config_option(train_command) change_parent_task(train_command, prepr_task_id) parent_task_id = prepr_task_id content["docker"]["command"] = train_command if task_type != "prepr": task_id, explicitname = build_task_id(content, xxyy, task_suffix, parent_task_id) if explicitname: patch_config_explicitname(content, explicitname) file_to_transtaskid = {} if task_type == "trans": try: idx = content["docker"]["command"].index("trans") output_files = get_params(("-o", "--output"), content["docker"]["command"][idx+1:]) for ofile in output_files: file_to_transtaskid[ofile] = task_id except Exception: pass content["ncpus"] = ncpus or \ get_cpu_count(current_configuration, ngpus, task_type) content["ngpus"] = ngpus if task_type == "trans" and can_trans_as_release: if "--as_release" not in content["docker"]["command"] and trans_as_release: content["docker"]["command"].append("--as_release") content["ngpus"] = ngpus = 0 task_resource = service_module.select_resource_from_capacity( resource, Capacity(content["ngpus"], content["ncpus"])) task_create.append( (redis, taskfile_dir, task_id, task_type, parent_task_id, task_resource, service, _duplicate_adapt(service_module, content), 
files, priority, content["ngpus"], content["ncpus"], {})) task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ( task_type, task_id, content["ngpus"], content["ncpus"])) parent_task_type = task_type[:5] remove_config_option(content["docker"]["command"]) if totranslate: content_translate = deepcopy(content) content_translate["priority"] = priority + 1 if trans_as_release: content_translate["ngpus"] = 0 else: content_translate["ngpus"] = min(ngpus, 1) content_translate["ncpus"] = ncpus or \ get_cpu_count(current_configuration, content_translate["ngpus"], "trans") translate_resource = service_module.select_resource_from_capacity( resource, Capacity(content_translate["ngpus"], content_translate["ncpus"])) if ngpus == 0 or trans_as_release: file_per_gpu = len(totranslate) else: file_per_gpu = (len(totranslate)+ngpus-1) / ngpus subset_idx = 0 while subset_idx * file_per_gpu < len(totranslate): content_translate["docker"]["command"] = ["trans"] if trans_as_release: content_translate["docker"]["command"].append("--as_release") content_translate["docker"]["command"].append('-i') subset_totranslate = totranslate[subset_idx*file_per_gpu: (subset_idx+1)*file_per_gpu] for f in subset_totranslate: content_translate["docker"]["command"].append(f[0]) change_parent_task(content_translate["docker"]["command"], task_id) trans_task_id, explicitname = build_task_id(content_translate, xxyy, "trans", task_id) content_translate["docker"]["command"].append('-o') for f in subset_totranslate: ofile = f[1].replace('<MODEL>', task_id) file_to_transtaskid[ofile] = trans_task_id content_translate["docker"]["command"].append(ofile) task_create.append( (redis, taskfile_dir, trans_task_id, "trans", task_id, translate_resource, service, _duplicate_adapt(service_module, content_translate), (), content_translate["priority"], content_translate["ngpus"], content_translate["ncpus"], {})) task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ( "trans", trans_task_id, content_translate["ngpus"], 
content_translate["ncpus"])) subset_idx += 1 if toscore: toscore_parent = {} for (ofile, rfile) in toscore: ofile = ofile.replace('<MODEL>', task_id) parent_task_id = file_to_transtaskid.get(ofile) if parent_task_id: if parent_task_id not in toscore_parent: toscore_parent[parent_task_id] = {"output": [], "ref": []} ofile_split = ofile.split(':') if len(ofile_split) == 2 and ofile_split[0] == 'launcher': ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1] toscore_parent[parent_task_id]["output"].append(ofile) toscore_parent[parent_task_id]["ref"].append(rfile) for parent_task_id, oref in six.iteritems(toscore_parent): content_score = deepcopy(content) content_score["priority"] = priority + 1 content_score["ngpus"] = 0 content_score["ncpus"] = 1 score_resource = service_module.select_resource_from_capacity(resource, Capacity(0, 1)) image_score = "nmtwizard/score" option_lang = [] if parent_struct is not None: option_lang.append('-l') option_lang.append(parent_struct['xxyy'][-2:]) content_score["docker"] = { "image": image_score, "registry": _get_registry(service_module, image_score), "tag": "latest", "command": ["score", "-o"] + oref["output"] + ["-r"] + oref["ref"] + option_lang + ['-f', "launcher:scores"] } score_task_id, explicitname = build_task_id(content_score, xxyy, "score", parent_task_id) task_create.append( (redis, taskfile_dir, score_task_id, "exec", parent_task_id, score_resource, service, content_score, (), priority+2, 0, 1, {})) task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ( "score", score_task_id, 0, 1)) if totuminer: # tuminer can run in CPU only mode, but it will be very slow for large data ngpus_recommend = ngpus ncpus_recommend = ncpus totuminer_parent = {} for (ifile, ofile) in totuminer: #ofile = ofile.replace('<MODEL>', task_id) parent_task_id = file_to_transtaskid.get(ofile) if parent_task_id: if parent_task_id not in totuminer_parent: totuminer_parent[parent_task_id] = {"infile": [], "outfile": [], "scorefile": []} ofile_split = 
ofile.split(':') if len(ofile_split) == 2 and ofile_split[0] == 'launcher': ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1] totuminer_parent[parent_task_id]["infile"].append(ifile) totuminer_parent[parent_task_id]["outfile"].append(ofile) scorefile = ofile if scorefile.endswith(".gz"): scorefile = scorefile[:-3] totuminer_parent[parent_task_id]["scorefile"].append(scorefile[:-3]) for parent_task_id, in_out in six.iteritems(totuminer_parent): content_tuminer = deepcopy(content) content_tuminer["priority"] = priority + 1 content_tuminer["ngpus"] = ngpus_recommend content_tuminer["ncpus"] = ncpus_recommend tuminer_resource = service_module.select_resource_from_capacity(resource, Capacity(ngpus_recommend, ncpus_recommend)) image_score = "nmtwizard/tuminer" content_tuminer["docker"] = { "image": image_score, "registry": _get_registry(service_module, image_score), "tag": "latest", "command": ["tuminer", "--tumode", "score", "--srcfile"] + in_out["infile"] + ["--tgtfile"] + in_out["outfile"]+ ["--output"] + in_out["scorefile"] } tuminer_task_id, explicitname = build_task_id(content_tuminer, xxyy, "tuminer", parent_task_id) task_create.append( (redis, taskfile_dir, tuminer_task_id, "exec", parent_task_id, tuminer_resource, service, content_tuminer, (), priority+2, ngpus_recommend, ncpus_recommend, {})) task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ( "tuminer", tuminer_task_id, ngpus_recommend, ncpus_recommend)) iterations -= 1 if iterations > 0: parent_task_id = task_id change_parent_task(content["docker"]["command"], parent_task_id) (task_ids, task_create) = post_function('POST/task/launch', task_ids, task_create) for tc in task_create: task.create(*tc) if len(task_ids) == 1: task_ids = task_ids[0] return flask.jsonify(task_ids)
def _service_unqueue(self, service):
    """Find the best next task to push to the work queue.

    While holding the ``service:<name>`` lock, the free capacity of every
    resource returned by ``service.list_resources()`` is computed from the
    ``gpu_resource:``/``cpu_resource:`` hashes, then the ``queued:<name>``
    list is scanned from its last index down to 0.  Among the tasks whose
    parent dependency is satisfied and for which a resource is found, the
    one with the highest priority (ties broken by the smallest
    ``queued_time``) is pushed to the work queue and removed from the
    service queue.

    Args:
        service: service object; this method reads ``name``,
            ``resource_multitask`` and calls ``list_resources()``.

    Returns:
        None.  At most one task is moved to the work queue per call.
    """
    with self._redis.acquire_lock('service:' + service.name):
        queue = 'queued:%s' % service.name
        count = self._redis.llen(queue)

        preallocated_task_count = {}
        preallocated_task_resource = {}
        avail_resource = {}
        resources = service.list_resources()
        reserved = {}

        # list free cpu/gpus on each node
        for resource in resources:
            current_xpu_usage = Capacity()
            capacity = resources[resource]
            # Every key of this service is built from `service.name`, the same
            # name used for the lock, the queue and the allocation hashes read
            # by `_advance_task`.
            keygr = 'gpu_resource:%s:%s' % (service.name, resource)
            keycr = 'cpu_resource:%s:%s' % (service.name, resource)
            key_reserved = 'reserved:%s:%s' % (service.name, resource)

            gpu_tasks = self._redis.hgetall(keygr)
            cpu_tasks = self._redis.hgetall(keycr)
            task_reserved = self._redis.get(key_reserved)

            # can not launch multiple tasks on service with no multi-tasking (ec2)
            if not service.resource_multitask and \
               not task_reserved and \
               (gpu_tasks or cpu_tasks):
                continue

            # hash values are the ids of the tasks holding each gpu/cpu slot
            for _, owner in six.iteritems(gpu_tasks):
                if owner in preallocated_task_count:
                    preallocated_task_count[owner].incr_ngpus(1)
                else:
                    preallocated_task_count[owner] = Capacity(ngpus=1)
                    preallocated_task_resource[owner] = resource
                current_xpu_usage.incr_ngpus(1)
            for _, owner in six.iteritems(cpu_tasks):
                if owner in preallocated_task_count:
                    preallocated_task_count[owner].incr_ncpus(1)
                else:
                    preallocated_task_count[owner] = Capacity(ncpus=1)
                    preallocated_task_resource[owner] = resource
                current_xpu_usage.incr_ncpus(1)

            available_xpus = capacity - current_xpu_usage
            avail_resource[resource] = available_xpus
            reserved[resource] = task_reserved
            self._logger.debug("\tresource %s - reserved: %s - free %s",
                               resource, task_reserved or "False",
                               available_xpus)

        if not avail_resource:
            return

        # Go through the tasks, find if there are tasks that can be launched and
        # queue the best one
        best_task_id = None
        best_task_priority = -10000
        best_task_queued_time = 0
        while count > 0:
            count -= 1
            next_task_id = self._redis.lindex(queue, count)
            if next_task_id is None:
                continue

            next_keyt = 'task:%s' % next_task_id
            parent = self._redis.hget(next_keyt, 'parent')
            # check parent dependency
            if parent:
                keyp = 'task:%s' % parent
                if self._redis.exists(keyp):
                    # if the parent task is in the database, check for dependencies
                    parent_status = self._redis.hget(keyp, 'status')
                    if parent_status != 'stopped':
                        if parent_status == 'running':
                            # parent is still running so update queued time to be as close
                            # as possible to terminate time of parent task
                            self._redis.hset(next_keyt, "queued_time", time.time())
                        continue
                    if self._redis.hget(keyp, 'message') != 'completed':
                        # the parent stopped without completing: the child cannot run
                        task.terminate(self._redis, next_task_id,
                                       phase='dependency_error')
                        continue

            nxpus = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                             self._redis.hget(next_keyt, 'ncpus'))

            found_resource = False
            if next_task_id in preallocated_task_count:
                # if task is pre-allocated, can only continue on the same node
                r = preallocated_task_resource[next_task_id]
                nxpus -= preallocated_task_count[next_task_id]
                avail_r = avail_resource[r]
                found_resource = ((nxpus.ngpus == 0 and avail_r.ncpus != 0) or
                                  (nxpus.ngpus != 0 and avail_r.ngpus != 0))
            else:
                # can the task be launched on any node
                for r, v in six.iteritems(avail_resource):
                    # cannot launch a new task on a reserved node
                    if reserved[r]:
                        continue
                    # NOTE(review): `v.ncpus >= 0` also accepts a node with no
                    # free cpu, while the pre-allocated branch above requires
                    # `ncpus != 0` - confirm this is intended.
                    if ((nxpus.ngpus > 0 and
                         resources[r].ngpus >= nxpus.ngpus and
                         v.ngpus > 0) or
                            (nxpus.ngpus == 0 and v.ncpus >= 0)):
                        found_resource = True
                        break
            if not found_resource:
                continue

            priority = int(self._redis.hget(next_keyt, 'priority'))
            queued_time = float(self._redis.hget(next_keyt, 'queued_time'))
            # higher priority wins; on equal priority the task queued first wins
            if priority > best_task_priority or (
                    priority == best_task_priority and
                    best_task_queued_time > queued_time):
                best_task_priority = priority
                best_task_id = next_task_id
                best_task_queued_time = queued_time

        if best_task_id:
            self._logger.info('selected %s to be launched on %s',
                              best_task_id, service.name)
            task.work_queue(self._redis, best_task_id, service.name)
            self._redis.lrem(queue, 0, best_task_id)
def _advance_task(self, task_id):
    """Tries to advance the task to the next status. If it can, re-queue it
    immediately to process the next stage. Otherwise, re-queue it after some
    delay to try again.

    The whole transition runs under the ``task:<task_id>`` redis lock.  The
    statuses handled are ``queued``, ``allocating``, ``allocated``,
    ``running`` and ``terminating``; a ``stopped`` task is left untouched.

    Args:
        task_id: identifier of the task; its state is read from the redis
            hash ``task:<task_id>``.

    Raises:
        ValueError: if the service recorded on the task is not one of
            ``self._services``.
    """
    keyt = 'task:%s' % task_id
    with self._redis.acquire_lock(keyt, acquire_timeout=1, expire_time=600):
        status = self._redis.hget(keyt, 'status')
        if status == 'stopped':
            return

        service_name = self._redis.hget(keyt, 'service')
        if service_name not in self._services:
            raise ValueError('unknown service %s' % service_name)
        service = self._services[service_name]

        self._logger.info('%s: trying to advance from status %s', task_id, status)

        if status == 'queued':
            resource = self._redis.hget(keyt, 'resource')
            parent = self._redis.hget(keyt, 'parent')
            if parent:
                keyp = 'task:%s' % parent
                # if the parent task is in the database, check for dependencies
                if self._redis.exists(keyp):
                    parent_status = self._redis.hget(keyp, 'status')
                    if parent_status == 'stopped':
                        if self._redis.hget(keyp, 'message') != 'completed':
                            # the parent stopped without completing
                            task.terminate(self._redis, task_id,
                                           phase='dependency_error')
                            return
                    else:
                        self._logger.warning(
                            '%s: depending on other task, waiting', task_id)
                        task.service_queue(self._redis, task_id, service.name)
                        return

            nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                             self._redis.hget(keyt, 'ncpus'))
            resource, available_xpus = self._allocate_resource(
                task_id, resource, service, nxpus)
            if resource is not None:
                self._logger.info('%s: resource %s reserved %s/%s',
                                  task_id, resource, available_xpus, nxpus)
                self._redis.hset(keyt, 'alloc_resource', resource)
                # a partial reservation keeps the task in 'allocating' until
                # the rest of the requested capacity is obtained
                if nxpus == available_xpus:
                    task.set_status(self._redis, keyt, 'allocated')
                else:
                    task.set_status(self._redis, keyt, 'allocating')
                task.work_queue(self._redis, task_id, service_name)
            else:
                self._logger.warning('%s: no resources available, waiting', task_id)
                task.service_queue(self._redis, task_id, service.name)

        elif status == 'allocating':
            resource = self._redis.hget(keyt, 'alloc_resource')
            nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                             self._redis.hget(keyt, 'ncpus'))

            # count the gpu/cpu slots of the resource already held by this task
            already_allocated_xpus = Capacity()
            keygr = 'gpu_resource:%s:%s' % (service.name, resource)
            for _, owner in six.iteritems(self._redis.hgetall(keygr)):
                if owner == task_id:
                    already_allocated_xpus.incr_ngpus(1)
            keycr = 'cpu_resource:%s:%s' % (service.name, resource)
            for _, owner in six.iteritems(self._redis.hgetall(keycr)):
                if owner == task_id:
                    already_allocated_xpus.incr_ncpus(1)

            capacity = service.list_resources()[resource]
            available_xpus, _ = self._reserve_resource(
                service, resource, capacity, task_id,
                nxpus - already_allocated_xpus,
                Capacity(), Capacity(-1, -1), True)
            self._logger.info(
                'task: %s - resource: %s (capacity %s)- already %s - available %s',
                task_id, resource, capacity, already_allocated_xpus, available_xpus)
            if available_xpus and available_xpus == nxpus - already_allocated_xpus:
                # everything still missing has been reserved
                task.set_status(self._redis, keyt, 'allocated')
                key_reserved = 'reserved:%s:%s' % (service.name, resource)
                self._redis.delete(key_reserved)
                task.work_queue(self._redis, task_id, service.name)
            else:
                task.work_queue(self._redis, task_id, service.name, delay=20)

        elif status == 'allocated':
            content = json.loads(self._redis.hget(keyt, 'content'))
            resource = self._redis.hget(keyt, 'alloc_resource')
            self._logger.info('%s: launching on %s', task_id, service.name)
            try:
                # the hash fields whose value is this task id are the slots it holds
                keygr = 'gpu_resource:%s:%s' % (service.name, resource)
                lgpu = [k for k, v in six.iteritems(self._redis.hgetall(keygr))
                        if v == task_id]
                self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
                keycr = 'cpu_resource:%s:%s' % (service.name, resource)
                lcpu = [k for k, v in six.iteritems(self._redis.hgetall(keycr))
                        if v == task_id]
                self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))
                data = service.launch(
                    task_id,
                    content['options'],
                    (lgpu, lcpu),
                    resource,
                    content['docker']['registry'],
                    content['docker']['image'],
                    content['docker']['tag'],
                    content['docker']['command'],
                    task.file_list(self._redis, self._taskfile_dir, task_id),
                    content['wait_after_launch'],
                    self._redis.hget(keyt, 'token'),
                    content.get('support_statistics'))
            except EnvironmentError as e:
                # the resource is not available and will be set busy
                self._block_resource(resource, service, str(e))
                self._redis.hdel(keyt, 'alloc_resource')
                # set the task as queued again
                self._release_resource(
                    service, resource, task_id,
                    Capacity(self._redis.hget(keyt, 'ngpus'),
                             self._redis.hget(keyt, 'ncpus')))
                task.set_status(self._redis, keyt, 'queued')
                task.service_queue(self._redis, task_id, service.name)
                self._logger.info(
                    'could not launch [%s] %s on %s: blocking resource',
                    str(e), task_id, resource)
                return
            except Exception as e:
                # all other errors make the task fail
                self._logger.info('fail task [%s] - %s', task_id, str(e))
                task.append_log(self._redis, self._taskfile_dir, task_id, str(e))
                task.terminate(self._redis, task_id, phase='launch_error')
                return
            self._logger.info('%s: task started on %s', task_id, service.name)
            self._redis.hset(keyt, 'job', json.dumps(data))
            task.set_status(self._redis, keyt, 'running')
            # For services that do not notify their activity, we should
            # poll the task status more regularly.
            task.work_queue(self._redis, task_id, service.name,
                            delay=120 if service.is_notifying_activity else 30)

        elif status == 'running':
            self._logger.debug('- checking activity of task: %s', task_id)
            data = json.loads(self._redis.hget(keyt, 'job'))
            try:
                status = service.status(task_id, data)
            except Exception as e:
                self._logger.info('cannot get status for [%s] - %s', task_id, str(e))
                self._redis.hincrby(keyt, 'status_fail', 1)
                # The counter is read back from redis as a string: convert it
                # before comparing, otherwise the comparison with an int is
                # always true on Python 2 and a TypeError on Python 3.
                if int(self._redis.hget(keyt, 'status_fail')) > 4:
                    task.terminate(self._redis, task_id, phase='lost_connection')
                    return
                # below the threshold `status` is still 'running', so the task
                # is simply polled again later
            else:
                # a successful poll resets the failure counter
                self._redis.hdel(keyt, 'status_fail')
            if status == 'dead':
                self._logger.info(
                    '%s: task no longer running on %s, request termination',
                    task_id, service.name)
                task.terminate(self._redis, task_id, phase='exited')
            else:
                task.work_queue(self._redis, task_id, service.name,
                                delay=600 if service.is_notifying_activity else 120)

        elif status == 'terminating':
            data = self._redis.hget(keyt, 'job')
            nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                             self._redis.hget(keyt, 'ncpus'))
            if data is not None:
                container_id = self._redis.hget(keyt, 'container_id')
                data = json.loads(data)
                data['container_id'] = container_id
                self._logger.info('%s: terminating task (job: %s)',
                                  task_id, json.dumps(data))
                try:
                    service.terminate(data)
                    self._logger.info('%s: terminated', task_id)
                except Exception:
                    # best effort: the task is stopped and released regardless
                    self._logger.warning('%s: failed to terminate', task_id)
            else:
                self._logger.info('%s: terminating task (on error)', task_id)
            resource = self._redis.hget(keyt, 'alloc_resource')
            if resource:
                self._release_resource(service, resource, task_id, nxpus)
            task.set_status(self._redis, keyt, 'stopped')
            task.disable(self._redis, task_id)
def list_resources(self):
    """Return the capacity of every server declared in the server pool.

    Returns:
        dict: maps ``_hostname(server)`` to a ``Capacity`` built from the
        number of entries in the server's ``gpus`` and ``cpus`` settings,
        for each server of ``self._config['variables']['server_pool']``.
    """
    capacities = {}
    for server in self._config['variables']['server_pool']:
        name = _hostname(server)
        gpu_count = len(server['gpus'])
        cpu_count = len(server['cpus'])
        capacities[name] = Capacity(gpu_count, cpu_count)
    return capacities
from nmtwizard.capacity import Capacity # extracted from https://www.ec2instances.info - might need some update ec2_capacity_map = { "a1.2xlarge": Capacity(0, 8), "a1.xlarge": Capacity(0, 4), "a1.large": Capacity(0, 2), "a1.medium": Capacity(0, 1), "a1.4xlarge": Capacity(0, 16), "c1.xlarge": Capacity(0, 8), "c1.medium": Capacity(0, 2), "c3.2xlarge": Capacity(0, 8), "c3.8xlarge": Capacity(0, 32), "c3.xlarge": Capacity(0, 4), "c3.large": Capacity(0, 2), "c3.4xlarge": Capacity(0, 16), "c4.2xlarge": Capacity(0, 8), "c4.8xlarge": Capacity(0, 36), "c4.xlarge": Capacity(0, 4), "c4.large": Capacity(0, 2), "c4.4xlarge": Capacity(0, 16), "c5.18xlarge": Capacity(0, 72), "c5d.18xlarge": Capacity(0, 72), "c5.9xlarge": Capacity(0, 36), "c5d.9xlarge": Capacity(0, 36), "c5.2xlarge": Capacity(0, 8), "c5d.2xlarge": Capacity(0, 8), "c5.xlarge": Capacity(0, 4), "c5d.xlarge": Capacity(0, 4), "c5.large": Capacity(0, 2), "c5d.large": Capacity(0, 2),