Example #1
            def load_machines(self, service_name):
                self.entities_usage = EntityUsage.initialize_entities_usage(
                    self.worker._mongo_client, service_name)
                for resource, machine in six.iteritems(self._machines):
                    current_xpu_usage = Capacity()
                    keygr = 'gpu_resource:%s:%s' % (self.worker._service,
                                                    resource)
                    keycr = 'cpu_resource:%s:%s' % (self.worker._service,
                                                    resource)

                    gpu_tasks = self.worker._redis.hgetall(keygr)
                    cpu_tasks = self.worker._redis.hgetall(keycr)

                    # cannot launch multiple tasks on a service without multi-tasking (ec2),
                    # nor on a service with hybrid task mode and dynamic resource mode (nova)
                    if not _is_resource_multitask(
                            self.worker._service, resource) and (gpu_tasks or cpu_tasks):
                        continue
                    tmp_tasks = {}
                    for _, v in six.iteritems(gpu_tasks):
                        if v not in tmp_tasks:
                            task_entity = task.get_owner_entity(
                                self.worker._redis, v)
                            tmp_tasks[v] = task_entity
                        else:
                            task_entity = tmp_tasks[v]

                        if v not in self.preallocated_task_resource:
                            self.preallocated_task_resource[v] = resource
                        self._machines[resource].add_task(
                            v, self.worker._redis)
                        current_xpu_usage.incr_ngpus(1)
                        self.entities_usage[task_entity].add_current_usage(
                            Capacity(ngpus=1))

                    for _, v in six.iteritems(cpu_tasks):
                        if v not in tmp_tasks:
                            task_entity = task.get_owner_entity(
                                self.worker._redis, v)
                            tmp_tasks[v] = task_entity
                        else:
                            task_entity = tmp_tasks[v]

                        if v not in self.preallocated_task_resource:
                            self.preallocated_task_resource[v] = resource

                        self._machines[resource].add_task(
                            v, self.worker._redis)
                        current_xpu_usage.incr_ncpus(1)
                        self.entities_usage[task_entity].add_current_usage(
                            Capacity(ncpus=1))

                    available_xpus = machine._init_capacity - current_xpu_usage
                    self._machines[resource].set_available(available_xpus)
                    self.worker._logger.debug("\tresource %s: - free %s",
                                              resource, available_xpus)

                return len(self._machines) > 0
Example #2
 def __init__(self, config):
     super().__init__(config)
     self._nova_client = init_nova_client(config)
     self._templates = []
     self._resources = {}
     self._machines = {}
     for template in config['variables']['template_pool']:
         instance_type = template['name']
         if instance_type not in ovh_capacity_map:
             raise ValueError('unknown instance type: %s' % instance_type)
         xpu = ovh_capacity_map[instance_type]
         try:
             flavor = self._nova_client.flavors.find(name=instance_type)
         except novaclient.exceptions.NotFound:
             raise ValueError('no flavor found for instance type: %s' % instance_type)
         template["id"] = flavor.id
         template["name"] = flavor.name
         template["gpus"] = range(xpu.ngpus)
         template["cpus"] = range(xpu.ncpus)
         maxInstances = template.get("maxInstances", 1)
         self._templates.append(template)
         for idx in range(maxInstances):
             self._resources["%s:%d" % (template["name"], idx)] = \
                 Capacity(len(template["gpus"]), len(template["cpus"]))
             self._machines["%s:%d" % (template["name"], idx)] = template
     logger.info("Initialized OVH instance - found %d templates.",
                 len(config['variables']['template_pool']))
Example #3
 def _handle_terminating_task(self, task_id):
     keyt = 'task:%s' % task_id
     _, service = self._get_service(keyt=keyt)
     data = self._redis.hget(keyt, 'job')
     nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                      self._redis.hget(keyt, 'ncpus'))
     if data is not None:
         container_id = self._redis.hget(keyt, 'container_id')
         data = json.loads(data)
         data['container_id'] = container_id
         self._logger.info('%s: terminating task (job: %s)', task_id,
                           json.dumps(data))
         try:
             service.terminate(data)
             self._logger.info('%s: terminated', task_id)
         except Exception:
             self._logger.warning('%s: failed to terminate', task_id)
             self._logger.info(traceback.format_exc())
     else:
         self._logger.info('%s: terminating task (on error)', task_id)
     resource = self._redis.hget(keyt, 'alloc_resource')
     if resource:
         self._release_resource(service, resource, task_id, nxpus)
     task.set_status(self._redis, keyt, 'stopped')
     task.disable(self._redis, task_id)
Example #4
 def __init__(self, service, name, initial_capacity, logger):
     self._init_capacity = initial_capacity
     self._name = name
     self._available_cap = Capacity()
     self._tasks = {}
     self._logger = logger
     self._service = service
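
All of the examples on this page revolve around the same small `Capacity` value object, whose definition is not shown here. The sketch below is a hypothetical reconstruction, inferred only from how the examples use it: a constructor that coerces the strings Redis returns, `incr_ngpus`/`incr_ncpus`, arithmetic (`+`, `-`, and the in-place forms), equality, component iteration and indexing (Example #14 iterates with `enumerate(capacity)`), and the `inf_or_eq` comparison from Example #11.

class Capacity:
    """Hypothetical sketch of the (ngpus, ncpus) pair used throughout these examples."""

    def __init__(self, ngpus=0, ncpus=0):
        # values read back from Redis arrive as strings (or None), so coerce them
        self.ngpus = int(ngpus) if ngpus else 0
        self.ncpus = int(ncpus) if ncpus else 0

    def incr_ngpus(self, n):
        self.ngpus += n

    def incr_ncpus(self, n):
        self.ncpus += n

    def __add__(self, other):
        return Capacity(self.ngpus + other.ngpus, self.ncpus + other.ncpus)

    def __sub__(self, other):
        return Capacity(self.ngpus - other.ngpus, self.ncpus - other.ncpus)

    def __eq__(self, other):
        return self.ngpus == other.ngpus and self.ncpus == other.ncpus

    def __iter__(self):
        # lets Example #14 write `for idx, val in enumerate(capacity)`
        return iter((self.ngpus, self.ncpus))

    def __getitem__(self, idx):
        # lets Example #17 write `task_asked_capacity[idx]`
        return (self.ngpus, self.ncpus)[idx]

    def inf_or_eq(self, other):
        # component-wise <=, used by Example #11
        return self.ngpus <= other.ngpus and self.ncpus <= other.ncpus

    def __repr__(self):
        return "(%d gpus, %d cpus)" % (self.ngpus, self.ncpus)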
Example #5
 def list_resources(self):
     resources = {
         server: Capacity(len(self._machines[server]['gpus']),
                          len(self._machines[server]['cpus']))
         for server in self._machines
     }
     return resources
Example #6
 def total_capacity(self):
     """Total capacity of the service (i.e. the total number of tasks that
     can run at the same time).
     """
     tc = Capacity()
     for v in six.itervalues(self.list_resources()):
         tc += v
     return tc
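
Under the sketch above, `total_capacity` just folds the per-resource capacities returned by `list_resources` into a single total. A minimal illustration; the resource names and sizes are invented:

# three hypothetical nodes
resources = {
    "gpu_node:0": Capacity(4, 16),
    "gpu_node:1": Capacity(4, 16),
    "cpu_node:0": Capacity(0, 32),
}

tc = Capacity()
for v in resources.values():
    tc += v
print(tc)  # (8 gpus, 64 cpus)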
Example #7
def _usagecapacity(service):
    """calculate the current usage of the service."""
    usage_xpu = Capacity()
    capacity_xpus = Capacity()
    busy = 0
    detail = {}
    resources = service.list_resources()
    for resource, r_capacity in six.iteritems(resources):
        detail[resource] = {'busy': '', 'reserved': ''}
        detail[resource]['capacity'] = r_capacity
        capacity_xpus += r_capacity
        reserved = redis.get("reserved:%s:%s" % (service.name, resource))
        if reserved:
            detail[resource]['reserved'] = reserved

        count_map_gpu = Counter()
        count_map_cpu = Counter()
        task_type = {}
        count_used_xpus = Capacity()

        r_usage_gpu = redis.hgetall("gpu_resource:%s:%s" %
                                    (service.name, resource)).values()
        for t in r_usage_gpu:
            if t not in task_type:
                task_type[t] = redis.hget("task:%s" % t, "type")
            count_map_gpu[t] += 1
            count_used_xpus.incr_ngpus(1)

        r_usage_cpu = redis.hgetall("cpu_resource:%s:%s" %
                                    (service.name, resource)).values()
        for t in r_usage_cpu:
            if t not in task_type:
                task_type[t] = redis.hget("task:%s" % t, "type")
            count_map_cpu[t] += 1
            count_used_xpus.incr_ncpus(1)

        detail[resource]['usage'] = [
            "%s %s: %d (%d)" %
            (task_type[t], t, count_map_gpu[t], count_map_cpu[t])
            for t in task_type
        ]
        detail[resource][
            'avail_gpus'] = r_capacity.ngpus - count_used_xpus.ngpus
        detail[resource][
            'avail_cpus'] = r_capacity.ncpus - count_used_xpus.ncpus
        err = redis.get("busy:%s:%s" % (service.name, resource))
        if err:
            detail[resource]['busy'] = err
            busy = busy + 1
        usage_xpu += count_used_xpus
    queued = redis.llen("queued:" + service.name)
    return ("%d (%d)" % (usage_xpu.ngpus, usage_xpu.ncpus), queued,
            "%d (%d)" % (capacity_xpus.ngpus, capacity_xpus.ncpus), busy,
            detail)
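
The tuple `_usagecapacity` returns is built purely for display. A sketch of its shape, assuming the `Capacity` sketch above; every value is invented:

# usage, queued, capacity, busy, detail = _usagecapacity(service)
usage, queued, capacity, busy, detail = (
    "3 (12)",   # used gpus (cpus), formatted for display
    5,          # number of tasks queued on the service
    "8 (64)",   # total capacity in the same "gpus (cpus)" format
    1,          # number of resources currently marked busy
    {
        "gpu_node:0": {
            "busy": "",
            "reserved": "",
            "capacity": Capacity(4, 16),
            "usage": ["train task1: 2 (8)", "trans task2: 1 (4)"],
            "avail_gpus": 1,
            "avail_cpus": 4,
        },
    },
)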
Example #8
 def __init__(self,
              service,
              name,
              initial_capacity,
              logger,
              priority=None):
     self._init_capacity = initial_capacity
     self._name = name
     self._available_cap = Capacity()
     self._tasks = {}
     self._logger = logger
     self._service = service
     self._priority = priority if priority else 1
Example #9
    def _allocate_resource(self, task_id, request_resource, service, nxpus):
        """Allocates a resource for task_id and returns the name of the resource
           (or None if none where allocated), and the number of allocated gpus/cpus
        """
        best_resource = None
        br_available_xpus = Capacity()
        br_remaining_xpus = Capacity(-1, -1)
        resources = service.list_resources()

        for name, capacity in six.iteritems(resources):
            if _compatible_resource(name, request_resource):
                available_xpus, remaining_xpus = self._reserve_resource(
                    service, name, capacity, task_id, nxpus, br_available_xpus,
                    br_remaining_xpus)
                if available_xpus is not False:
                    if best_resource is not None:
                        self._release_resource(service, best_resource, task_id,
                                               nxpus)
                    best_resource = name
                    br_remaining_xpus = remaining_xpus
                    br_available_xpus = available_xpus

        return best_resource, br_available_xpus
Example #10
    def __init__(self, task_infos, must_patch_config_name=False):
        self._content = deepcopy(task_infos.content)
        self._lang_pair = f'{task_infos.request_data["source"]}{task_infos.request_data["target"]}'
        # _parent_task_id (and _task_suffix below) are assumed to be set by a
        # subclass before this base __init__ runs
        if not self._lang_pair and self._parent_task_id:
            self._lang_pair = self._parent_task_id.split("_")[1]
        self._service = task_infos.service
        self._service_config = task_infos.routes_configuration.service_config
        self._service_module = task_infos.routes_configuration.service_module
        self._files = task_infos.files
        self.other_task_info = {
            TaskEnum.ENTITY_OWNER.value:
            task_infos.routes_configuration.entity_owner,
            TaskEnum.STORAGE_ENTITIES.value:
            json.dumps(task_infos.routes_configuration.trainer_entities)
        }
        if task_infos.other_infos:
            self.update_other_infos(task_infos.other_infos)
        self._priority = self._content.get("priority", 0)
        self._resource = task_infos.resource

        if self._task_suffix:
            self.task_id, explicit_name = build_task_id(
                self._content, self._lang_pair, self._task_suffix,
                self._parent_task_id)
            if must_patch_config_name:
                TaskBase.patch_config_explicit_name(self._content,
                                                    explicit_name)

        if not self._resource:
            self._resource = self._service_module.get_resource_from_options(
                self._content["options"])
        self._resource = self._service_module.select_resource_from_capacity(
            self._resource,
            Capacity(self._content["ngpus"], self._content["ncpus"]))
Example #11
            def try_create(next_task_id):
                next_keyt = 'task:%s' % next_task_id
                parent = self._redis.hget(next_keyt, 'parent')
                task_entity = task.get_owner_entity(self._redis, next_task_id)

                if task_entity not in resource_mgr.entities_usage:
                    self._logger.error(
                        "\t[Task %s] entity %s has no usage limit!",
                        next_task_id, task_entity)
                    return None

                # check parent dependency
                if parent:
                    keyp = 'task:%s' % parent
                    if self._redis.exists(keyp):
                        # if the parent task is in the database, check for dependencies
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status != 'stopped':
                            if parent_status == 'running':
                                # parent is still running so update queued time to be as close
                                # as possible to terminate time of parent task
                                self._redis.hset(next_keyt, "queued_time",
                                                 time.time())
                            return None

                        if self._redis.hget(keyp, 'message') != 'completed':
                            task.terminate(self._redis,
                                           next_task_id,
                                           phase='dependency_error')
                            return None

                task_capacity = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                         self._redis.hget(next_keyt, 'ncpus'))
                candidate_task = CandidateTask(
                    next_task_id, task_entity, self._redis, task_capacity,
                    resource_mgr.entities_usage[task_entity], self._logger)
                # now check that the task has a chance to be processed by some machine
                for _, machine in six.iteritems(resource_mgr._machines):
                    can_be_processed = machine._is_authorized(candidate_task._entity, candidate_task._capacity) \
                                       and candidate_task._capacity.inf_or_eq(machine._init_capacity)
                    if can_be_processed:
                        return candidate_task

                return None
Example #12
 def _distribute_machine_for_task(self, task_id, task_entity,
                                  task_expected_capacity, request_resource,
                                  service, machines):
     best_resource = None
     br_remaining_xpus = Capacity(-1, -1)
     for name, machine in six.iteritems(machines):
         if _compatible_resource(
                 name, request_resource) and machine._is_authorized(
                     task_entity, task_expected_capacity):
             better_remaining_xpus = self._reserve_resource(
                 service, name, machine._init_capacity, task_id,
                 task_expected_capacity, br_remaining_xpus)
             if better_remaining_xpus is not None:
                 if best_resource is not None:
                     self._release_resource(service, best_resource, task_id,
                                            task_expected_capacity)
                 best_resource = name
                 br_remaining_xpus = better_remaining_xpus
     return best_resource
Example #13
 def __init__(self, config):
     super().__init__(config)
     self._session = boto3.Session(
         aws_access_key_id=config["variables"]["awsAccessKeyId"],
         aws_secret_access_key=config["variables"]["awsSecretAccessKey"],
         region_name=config["variables"]["awsRegion"])
     ec2_client = self._session.client("ec2")
     self._templates = []
     self._resources = {}
     self._machines = {}
     for template in config['variables']['template_pool']:
         response = ec2_client.describe_launch_template_versions(
             DryRun=False,
             LaunchTemplateName=template['name'],
             Filters=[{
                 'Name': 'is-default-version',
                 'Values': ["true"]
             }])
         if not response or not response["LaunchTemplateVersions"]:
             raise ValueError('cannot retrieve launch template')
         template_description = response["LaunchTemplateVersions"][0]
         if "LaunchTemplateData" not in template_description:
             raise ValueError('invalid template_description')
         launch_template_data = template_description["LaunchTemplateData"]
         if "InstanceType" not in launch_template_data or \
                 launch_template_data["InstanceType"] not in ec2_capacity_map:
             raise ValueError('unknown instance type: %s' %
                              launch_template_data["InstanceType"])
         xpu = ec2_capacity_map[launch_template_data["InstanceType"]]
         maxInstances = template.get("maxInstances", 1)
         template["id"] = template_description["LaunchTemplateId"]
         template["name"] = template_description["LaunchTemplateName"]
         template["gpus"] = range(xpu.ngpus)
         template["cpus"] = range(xpu.ncpus)
         self._templates.append(template)
         for idx in range(maxInstances):
             self._resources["%s:%d" % (template["name"], idx)] = \
                 Capacity(len(template["gpus"]), len(template["cpus"]))
             self._machines["%s:%d" % (template["name"], idx)] = template
     logger.info("Initialized EC2 - found %d templates.",
                 len(config['variables']['template_pool']))
Example #14
    def _reserve_resource(self,
                          service,
                          resource,
                          capacity,
                          task_id,
                          nxpus,
                          br_available_xpus,
                          br_remaining_xpus,
                          check_reserved=False):
        """Reserves the resource for task_id, if possible. The resource is locked
        while we try to reserve it.
        Resource should have more gpus available (within ngpus) than br_available_xpus
        or the same number but a smaller size
        """
        for idx, val in enumerate(capacity):
            if val < nxpus[idx]:
                return False, False

        keygr = 'gpu_resource:%s:%s' % (service.name, resource)
        keycr = 'cpu_resource:%s:%s' % (service.name, resource)
        key_busy = 'busy:%s:%s' % (service.name, resource)
        key_reserved = 'reserved:%s:%s' % (service.name, resource)
        with self._redis.acquire_lock(keygr):
            if self._redis.get(key_busy) is not None:
                return False, False
            # if we need gpus
            allocated_gpu = 0
            allocated_cpu = 0
            remaining_gpus = 0
            remaining_cpus = 0

            # allocate GPUs first; for GPUs we want to minimise fragmentation,
            # so minimize br_remaining_xpus.ngpus
            if nxpus.ngpus != 0:
                # do not allocate several runs on the same GPU
                current_usage_gpu = self._redis.hlen(keygr)
                if current_usage_gpu > 0 and not service.resource_multitask:
                    return False, False
                # available gpu is the capacity of the node less number of gpu used
                avail_gpu = capacity.ngpus - current_usage_gpu

                allocated_gpu = min(avail_gpu, nxpus.ngpus)
                remaining_gpus = avail_gpu - allocated_gpu
                if (allocated_gpu > 0
                        and ((allocated_gpu > br_available_xpus.ngpus) or
                             (allocated_gpu == br_available_xpus.ngpus
                              and remaining_gpus < br_remaining_xpus.ngpus))):
                    idx = 1
                    for _ in range(allocated_gpu):
                        while self._redis.hget(keygr, str(idx)) is not None:
                            idx += 1
                            assert idx <= capacity.ngpus, "invalid gpu alloc for %s" % keygr
                        self._redis.hset(keygr, str(idx), task_id)
                else:
                    return False, False

            # if we don't need to allocate GPUs anymore, start allocating CPUs
            # * for CPUs on a multitask service we want to maximize the remaining
            #   CPUs, to avoid overloading individual servers
            # * for CPUs on a monotask service we want to minimize the remaining
            #   CPUs, to avoid occupying an over-dimensioned server
            if allocated_gpu == nxpus.ngpus and nxpus.ncpus != 0:
                current_usage_cpu = self._redis.hlen(keycr)
                if current_usage_cpu > 0 and not service.resource_multitask:
                    return False, False
                avail_cpu = capacity.ncpus - current_usage_cpu
                allocated_cpu = min(avail_cpu, nxpus.ncpus)
                remaining_cpus = avail_cpu - allocated_cpu

                # for mono task service, allocate node with lowest cpu number
                if service.resource_multitask:
                    better_cpu_usage = remaining_cpus > br_remaining_xpus.ncpus
                else:
                    better_cpu_usage = remaining_cpus < br_remaining_xpus.ncpus

                if (allocated_cpu > 0
                        and (allocated_gpu != 0 or
                             (allocated_cpu > br_available_xpus.ncpus) or
                             (allocated_cpu == br_available_xpus.ncpus
                              and better_cpu_usage))):
                    idx = 0
                    for _ in range(allocated_cpu):
                        while self._redis.hget(keycr, str(idx)) is not None:
                            idx += 1
                            assert idx <= capacity.ncpus, "invalid cpu alloc for %s" % keycr
                        self._redis.hset(keycr, str(idx), task_id)
                else:
                    return False, False

            if allocated_gpu < nxpus.ngpus or allocated_cpu < nxpus.ncpus:
                self._redis.set(key_reserved, task_id)

            return Capacity(allocated_gpu,
                            allocated_cpu), Capacity(remaining_gpus,
                                                     remaining_cpus)
Example #15
 def __init__(self, current_usage, entity_name, usage_coeff):
     self._entity = entity_name
     self._current_usage_capacity = current_usage if current_usage else Capacity()
     self._usage_coeff = usage_coeff
Example #16
 def add_task(self, task_id, redis):
      if task_id not in self._tasks:
         redis_key = 'task:%s' % task_id
         task_capacity = Capacity(redis.hget(redis_key, 'ngpus'),
                                  redis.hget(redis_key, 'ncpus'))
         self._tasks[task_id] = task_capacity
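
With the `task_id` fix above, `add_task` reads the task's requested capacity back from Redis and caches it on the machine. A minimal illustration that combines Example #4 and this method under a hypothetical class name `Machine`, with a stub standing in for the Redis client (all ids and sizes invented; relies on the `Capacity` sketch above):

import logging

class _StubRedis:
    """Just enough of the Redis hash API for add_task."""
    def __init__(self, data):
        self._data = data

    def hget(self, key, field):
        return self._data.get(key, {}).get(field)

class Machine:
    def __init__(self, service, name, initial_capacity, logger):
        self._init_capacity = initial_capacity
        self._name = name
        self._available_cap = Capacity()
        self._tasks = {}
        self._logger = logger
        self._service = service

    def add_task(self, task_id, redis):
        if task_id not in self._tasks:
            redis_key = 'task:%s' % task_id
            self._tasks[task_id] = Capacity(redis.hget(redis_key, 'ngpus'),
                                            redis.hget(redis_key, 'ncpus'))

machine = Machine("myservice", "node0", Capacity(4, 16),
                  logging.getLogger(__name__))
machine.add_task("abc123", _StubRedis({"task:abc123": {"ngpus": "1", "ncpus": "4"}}))
print(machine._tasks["abc123"])  # (1 gpus, 4 cpus)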
Example #17
    def _reserve_resource(self, service, resource, capacity, task_id,
                          task_asked_capacity, br_remaining_xpus, br_priority,
                          resource_priority):
        """Reserves the resource for task_id, if possible. The resource is locked
        while we try to reserve it.
        Resource should have more gpus available (within ngpus) than br_available_xpus
        or the same number but a smaller size
        """
        self._logger.debug('service.name = %s', service.name)
        self._logger.debug('resource = %s', resource)
        self._logger.debug('capacity = (%d, %d)', capacity.ngpus,
                           capacity.ncpus)
        self._logger.debug('task_id = %s', task_id)
        self._logger.debug('nxpus = (%d, %d)', task_asked_capacity.ngpus,
                           task_asked_capacity.ncpus)
        self._logger.debug('br_remaining_xpus = (%d, %d)',
                           br_remaining_xpus.ngpus, br_remaining_xpus.ncpus)

        for idx, val in enumerate(capacity):
            if val < task_asked_capacity[idx]:
                return None, None

        keygr = 'gpu_resource:%s:%s' % (service.name, resource)
        keycr = 'cpu_resource:%s:%s' % (service.name, resource)
        key_busy = 'busy:%s:%s' % (service.name, resource)

        with self._redis.acquire_lock(keygr):
            if self._redis.get(key_busy) is not None:
                return None, None
            # if we need gpus
            remaining_gpus = 0
            remaining_cpus = 0

            # allocate GPUs first; for GPUs we want to minimise fragmentation,
            # so minimize br_remaining_xpus.ngpus

            current_usage_cpu = self._redis.hlen(keycr)
            self._logger.debug('current_usage_cpu = %d', current_usage_cpu)
            if current_usage_cpu > 0 and not _is_resource_multitask(
                    service, resource):
                return None, None
            avail_cpu = capacity.ncpus - current_usage_cpu
            if task_asked_capacity.ncpus > avail_cpu:
                return None, None

            if task_asked_capacity.ngpus != 0:
                # do not allocate several runs on the same GPU
                current_usage_gpu = self._redis.hlen(keygr)
                self._logger.debug('current_usage_gpu = %d', current_usage_gpu)
                if current_usage_gpu > 0 and not _is_resource_multitask(
                        service, resource):
                    return None, None
                # available gpu is the capacity of the node less number of gpu used
                avail_gpu = capacity.ngpus - current_usage_gpu
                self._logger.debug('avail_gpu = %d', avail_gpu)

                if task_asked_capacity.ngpus > avail_gpu:
                    return None, None

                remaining_gpus = avail_gpu - task_asked_capacity.ngpus
                self._logger.debug('remaining_gpus = %d', remaining_gpus)

                if br_remaining_xpus.ngpus != -1 and remaining_gpus >= br_remaining_xpus.ngpus and \
                        resource_priority == br_priority:
                    return None, None

            remaining_cpus = avail_cpu - task_asked_capacity.ncpus
            self._logger.debug('remaining_cpus = %d', remaining_cpus)

            # allocate node with higher resource priority
            # if priority for resources is equal, for mono task service, allocate node with lowest cpu number
            if resource_priority != br_priority:
                better_cpu_usage = resource_priority > br_priority
            elif _is_resource_multitask(service, resource):
                better_cpu_usage = remaining_cpus > br_remaining_xpus.ncpus
            else:
                better_cpu_usage = remaining_cpus < br_remaining_xpus.ncpus

            if br_remaining_xpus.ncpus != -1 and not better_cpu_usage:
                return None, None

            idx = 1
            for _ in range(task_asked_capacity.ngpus):
                while self._redis.hget(keygr, str(idx)) is not None:
                    idx += 1
                    assert idx <= capacity.ngpus, "invalid gpu alloc for %s" % keygr
                self._logger.debug('reserve GPU idx = %d', idx)
                self._redis.hset(keygr, str(idx), task_id)

            cpu_idx = 0
            for _ in range(task_asked_capacity.ncpus):
                while self._redis.hget(keycr, str(cpu_idx)) is not None:
                    cpu_idx += 1
                    assert cpu_idx + 1 <= capacity.ncpus, "invalid cpu alloc for %s" % keycr
                self._logger.debug('reserve CPU idx = %d', cpu_idx)
                self._redis.hset(keycr, str(cpu_idx), task_id)

            return Capacity(remaining_gpus, remaining_cpus), resource_priority
Example #18
 def _handle_allocated_task(self, task_id):
     keyt = 'task:%s' % task_id
     _, service = self._get_service(keyt=keyt)
     content = json.loads(self._redis.hget(keyt, 'content'))
     resource = self._redis.hget(keyt, 'alloc_resource')
     self._logger.info('%s: launching on %s', task_id, service.name)
     try:
         entity_config = self._get_current_config(task_id)
         keygr = 'gpu_resource:%s:%s' % (service.name, resource)
         lgpu = []
         for k, v in six.iteritems(self._redis.hgetall(keygr)):
             if v == task_id:
                 lgpu.append(k)
         self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
         keycr = 'cpu_resource:%s:%s' % (service.name, resource)
         lcpu = []
         for k, v in six.iteritems(self._redis.hgetall(keycr)):
             if v == task_id:
                 lcpu.append(k)
         self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))
         data = service.launch(
             task_id, content['options'], (lgpu, lcpu), resource,
             entity_config["storages"], entity_config["docker"],
             content['docker']['registry'], content['docker']['image'],
             content['docker']['tag'], content['docker']['command'],
             task.file_list(self._taskfile_dir,
                            task_id), content['wait_after_launch'],
             self._redis.hget(keyt, 'token'),
             content.get('support_statistics'))
     except EnvironmentError as e:
         # the resource is not available and will be set busy
         self._block_resource(resource, service, str(e))
         self._redis.hdel(keyt, 'alloc_resource')
         # set the task as queued again
         self._release_resource(
             service, resource, task_id,
             Capacity(self._redis.hget(keyt, 'ngpus'),
                      self._redis.hget(keyt, 'ncpus')))
         status = self._redis.hget(keyt, 'status')
         if status == 'terminating':
             return None
         task.set_status(self._redis, keyt, 'queued')
         task.service_queue(self._redis, task_id, service.name)
         self._logger.info(
             'could not launch [%s] %s on %s: blocking resource', str(e),
             task_id, resource)
         self._logger.info(traceback.format_exc())
         return None
     except Exception as e:
         # all other errors make the task fail
         self._logger.info('fail task [%s] - %s', task_id, str(e))
         self._logger.info(traceback.format_exc())
         task.append_log(self._taskfile_dir, task_id, str(e))
         auth_token = self._redis.hget(keyt, 'token')
         callback_url = service._config.get('callback_url')
         if auth_token:
             callback_url = callback_url.replace("://",
                                                 "://" + auth_token + ":x@")
         r = requests.get(os.path.join(callback_url, "task/terminate",
                                       task_id),
                          params={'phase': 'launch_error'})
         if r.status_code != 200:
             raise RuntimeError(
                 'incorrect result from \'task/terminate\' service: %s' %
                 r.text) from e
         task.terminate(self._redis, task_id, phase='launch_error')
         self._logger.info(traceback.format_exc())
         return None
     self._logger.info('%s: task started on %s', task_id, service.name)
     self._redis.hset(keyt, 'job', json.dumps(data))
     status = self._redis.hget(keyt, 'status')
     if status == 'terminating':
         return None
     task.set_status(self._redis, keyt, 'running')
     # For services that do not notify their activity, we should
     # poll the task status more regularly.
      task.work_queue(self._redis,
                      task_id,
                      service.name,
                      delay=120 if service.is_notifying_activity else 30)
     return None
Example #19
def launch(service):
    pool_entity = service[0:2].upper()
    if not has_ability(flask.g, "train", pool_entity):
        abort(make_response(jsonify(message="insufficient credentials for train "
                                            "(entity %s)" % pool_entity), 403))

    current_configuration_name = redis.hget("admin:service:%s" % service, "current_configuration")
    configurations = json.loads(redis.hget("admin:service:%s" % service, "configurations"))
    current_configuration = json.loads(configurations[current_configuration_name][1])

    content = flask.request.form.get('content')
    if content is not None:
        content = json.loads(content)
    else:
        abort(flask.make_response(flask.jsonify(message="missing content in request"), 400))

    files = {}
    for k in flask.request.files:
        files[k] = flask.request.files[k].read()

    service_module = get_service(service)
    content["service"] = service

    exec_mode = content.get('exec_mode', False)

    if not exec_mode:
        task_type = '????'
        if "train" in content["docker"]["command"]:
            task_type = "train"
        elif "trans" in content["docker"]["command"]:
            task_type = "trans"
        elif "preprocess" in content["docker"]["command"]:
            task_type = "prepr"
        elif "release" in content["docker"]["command"]:
            task_type = "relea"
        elif "buildvocab" in content["docker"]["command"]:
            task_type = "vocab"
    else:
        task_type = 'exec'

    if task_type == '????':
        abort(flask.make_response(flask.jsonify(message="incorrect task definition"), 400))

    elif task_type != "exec":
        task_suffix = task_type
    else:
        task_suffix = get_docker_action(content["docker"]["command"])
        if task_suffix is None:
            task_suffix = task_type

    # Sanity check on content.
    if 'options' not in content or not isinstance(content['options'], dict):
        abort(flask.make_response(flask.jsonify(message="invalid options field"), 400))
    if 'docker' not in content:
        abort(flask.make_response(flask.jsonify(message="missing docker field"), 400))
    if ('image' not in content['docker'] or 'registry' not in content['docker'] or
       'tag' not in content['docker'] or 'command' not in content['docker']):
        abort(flask.make_response(flask.jsonify(message="incomplete docker field"), 400))
    if content['docker']['registry'] == 'auto':
        content['docker']['registry'] = _get_registry(service_module, content['docker']['image'])
    elif content['docker']['registry'] not in service_module._config['docker']['registries']:
        abort(flask.make_response(flask.jsonify(message="unknown docker registry"), 400))

    resource = service_module.get_resource_from_options(content["options"])

    iterations = 1
    if "iterations" in content:
        iterations = content["iterations"]
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="chain mode unavailable in exec mode"), 400))
        if (task_type != "train" and iterations != 1) or iterations < 1:
            abort(flask.make_response(flask.jsonify(message="invalid value for iterations"), 400))

    ngpus = 1
    if "ngpus" in content:
        ngpus = content["ngpus"]
    ncpus = content.get("ncpus")

    # check that we have a resource able to run such a request
    if not _find_compatible_resource(service_module, ngpus, ncpus, resource):
        abort(flask.make_response(
                    flask.jsonify(message="no resource available on %s for %d gpus (%s cpus)" %
                                  (service, ngpus, ncpus and str(ncpus) or "-")), 400))

    if "totranslate" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="translate mode unavailable for exec cmd"), 400))
        totranslate = content["totranslate"]
        del content["totranslate"]
    else:
        totranslate = None
    if "toscore" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="score mode unavailable for exec cmd"), 400))
        toscore = content["toscore"]
        del content["toscore"]
    else:
        toscore = None
    if "totuminer" in content:
        if exec_mode:
            abort(flask.make_response(flask.jsonify(message="tuminer chain mode unavailable for exec cmd"), 400))
        totuminer = content["totuminer"]
        del content["totuminer"]
    else:
        totuminer = None

    docker_version = content['docker']['tag']
    if docker_version.startswith('v'):
        docker_version = docker_version[1:]
    try:
        chain_prepr_train = (not exec_mode and not content.get("nochainprepr", False) and
                             task_type == "train" and
                             semver.match(docker_version, ">=1.4.0"))
        can_trans_as_release = semver.match(docker_version, ">=1.8.0")
        trans_as_release = (not exec_mode and not content.get("notransasrelease", False) and
                            can_trans_as_release)
        content["support_statistics"] = semver.match(docker_version, ">=1.17.0")
    except ValueError:
        # could not match docker_version - not a valid semver
        chain_prepr_train = False
        can_trans_as_release = False
        trans_as_release = False

    priority = content.get("priority", 0)

    (xxyy, parent_task_id) = shallow_command_analysis(content["docker"]["command"])
    parent_struct = None
    parent_task_type = None
    if not exec_mode and parent_task_id:
        (parent_struct, parent_task_type) = model_name_analysis(parent_task_id)

    # check that parent model type matches current command
    if parent_task_type:
        if (parent_task_type == "trans" or parent_task_type == "relea" or
           (task_type == "prepr" and parent_task_type != "train" and parent_task_type != "vocab")):
            abort(flask.make_response(flask.jsonify(message="invalid parent task type: %s" %
                                      (parent_task_type)), 400))

    task_ids = []
    task_create = []

    while iterations > 0:
        if (chain_prepr_train and parent_task_type != "prepr") or task_type == "prepr":
            prepr_task_id, explicitname = build_task_id(content, xxyy, "prepr", parent_task_id)

            if explicitname:
                patch_config_explicitname(content, explicitname)

            idx = 0
            prepr_command = []
            train_command = content["docker"]["command"]
            while train_command[idx] != 'train' and train_command[idx] != 'preprocess':
                prepr_command.append(train_command[idx])
                idx += 1

            # create preprocess command, don't push the model on the catalog,
            # and generate a pseudo model
            prepr_command.append("--no_push")
            prepr_command.append("preprocess")
            prepr_command.append("--build_model")

            content["docker"]["command"] = prepr_command

            content["ncpus"] = ncpus or \
                get_cpu_count(current_configuration, 0, "preprocess")
            content["ngpus"] = 0

            preprocess_resource = service_module.select_resource_from_capacity(
                                            resource, Capacity(content["ngpus"], content["ncpus"]))

            # launch preprocess task on cpus only
            task_create.append(
                    (redis, taskfile_dir,
                     prepr_task_id, "prepr", parent_task_id, preprocess_resource, service,
                     _duplicate_adapt(service_module, content),
                     files, priority, 0, content["ncpus"], {}))
            task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % ("prepr", prepr_task_id, 0, content["ncpus"]))
            remove_config_option(train_command)
            change_parent_task(train_command, prepr_task_id)
            parent_task_id = prepr_task_id
            content["docker"]["command"] = train_command

        if task_type != "prepr":
            task_id, explicitname = build_task_id(content, xxyy, task_suffix, parent_task_id)

            if explicitname:
                patch_config_explicitname(content, explicitname)

            file_to_transtaskid = {}
            if task_type == "trans":
                try:
                    idx = content["docker"]["command"].index("trans")
                    output_files = get_params(("-o", "--output"), content["docker"]["command"][idx+1:])
                    for ofile in output_files:
                        file_to_transtaskid[ofile] = task_id
                except Exception:
                    pass

            content["ncpus"] = ncpus or \
                get_cpu_count(current_configuration, ngpus, task_type)
            content["ngpus"] = ngpus

            if task_type == "trans" and can_trans_as_release:
                if "--as_release" not in content["docker"]["command"] and trans_as_release:
                    content["docker"]["command"].append("--as_release")
                    content["ngpus"] = ngpus = 0

            task_resource = service_module.select_resource_from_capacity(
                                            resource, Capacity(content["ngpus"],
                                                               content["ncpus"]))

            task_create.append(
                    (redis, taskfile_dir,
                     task_id, task_type, parent_task_id, task_resource, service,
                     _duplicate_adapt(service_module, content),
                     files, priority,
                     content["ngpus"], content["ncpus"],
                     {}))
            task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                        task_type, task_id,
                        content["ngpus"], content["ncpus"]))
            parent_task_type = task_type[:5]
            remove_config_option(content["docker"]["command"])

            if totranslate:
                content_translate = deepcopy(content)
                content_translate["priority"] = priority + 1
                if trans_as_release:
                    content_translate["ngpus"] = 0
                else:
                    content_translate["ngpus"] = min(ngpus, 1)

                content_translate["ncpus"] = ncpus or \
                    get_cpu_count(current_configuration,
                                  content_translate["ngpus"], "trans")

                translate_resource = service_module.select_resource_from_capacity(
                                                resource, Capacity(content_translate["ngpus"],
                                                                   content_translate["ncpus"]))

                if ngpus == 0 or trans_as_release:
                    file_per_gpu = len(totranslate)
                else:
                    # integer ceil-division, so the slice indices below stay ints
                    file_per_gpu = (len(totranslate) + ngpus - 1) // ngpus
                subset_idx = 0
                while subset_idx * file_per_gpu < len(totranslate):
                    content_translate["docker"]["command"] = ["trans"]
                    if trans_as_release:
                        content_translate["docker"]["command"].append("--as_release")
                    content_translate["docker"]["command"].append('-i')
                    subset_totranslate = totranslate[subset_idx*file_per_gpu:
                                                     (subset_idx+1)*file_per_gpu]
                    for f in subset_totranslate:
                        content_translate["docker"]["command"].append(f[0])

                    change_parent_task(content_translate["docker"]["command"], task_id)
                    trans_task_id, explicitname = build_task_id(content_translate, xxyy, "trans", task_id)

                    content_translate["docker"]["command"].append('-o')
                    for f in subset_totranslate:
                        ofile = f[1].replace('<MODEL>', task_id)
                        file_to_transtaskid[ofile] = trans_task_id
                        content_translate["docker"]["command"].append(ofile)

                    task_create.append(
                            (redis, taskfile_dir,
                             trans_task_id, "trans", task_id, translate_resource, service,
                             _duplicate_adapt(service_module, content_translate),
                             (), content_translate["priority"],
                             content_translate["ngpus"], content_translate["ncpus"],
                             {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                                           "trans", trans_task_id,
                                           content_translate["ngpus"], content_translate["ncpus"]))
                    subset_idx += 1

            if toscore:
                toscore_parent = {}
                for (ofile, rfile) in toscore:
                    ofile = ofile.replace('<MODEL>', task_id)
                    parent_task_id = file_to_transtaskid.get(ofile)
                    if parent_task_id:
                        if parent_task_id not in toscore_parent:
                            toscore_parent[parent_task_id] = {"output": [], "ref": []}
                        ofile_split = ofile.split(':')
                        if len(ofile_split) == 2 and ofile_split[0] == 'launcher':
                            ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1]
                        toscore_parent[parent_task_id]["output"].append(ofile)
                        toscore_parent[parent_task_id]["ref"].append(rfile)
                for parent_task_id, oref in six.iteritems(toscore_parent):
                    content_score = deepcopy(content)
                    content_score["priority"] = priority + 1
                    content_score["ngpus"] = 0
                    content_score["ncpus"] = 1

                    score_resource = service_module.select_resource_from_capacity(resource, Capacity(0, 1))

                    image_score = "nmtwizard/score"

                    option_lang = []
                    if parent_struct is not None:
                        option_lang.append('-l')
                        option_lang.append(parent_struct['xxyy'][-2:])

                    content_score["docker"] = {
                        "image": image_score,
                        "registry": _get_registry(service_module, image_score),
                        "tag": "latest",
                        "command": ["score", "-o"] + oref["output"] + ["-r"] + oref["ref"] + option_lang + ['-f', "launcher:scores"]
                    }

                    score_task_id, explicitname = build_task_id(content_score, xxyy, "score", parent_task_id)
                    task_create.append(
                            (redis, taskfile_dir,
                             score_task_id, "exec", parent_task_id, score_resource, service,
                             content_score,
                             (), priority+2,
                             0, 1,
                             {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                                           "score", score_task_id,
                                           0, 1))

            if totuminer:
                # tuminer can run in CPU only mode, but it will be very slow for large data
                ngpus_recommend = ngpus
                ncpus_recommend = ncpus

                totuminer_parent = {}
                for (ifile, ofile) in totuminer:
                    #ofile = ofile.replace('<MODEL>', task_id)
                    parent_task_id = file_to_transtaskid.get(ofile)
                    if parent_task_id:
                        if parent_task_id not in totuminer_parent:
                            totuminer_parent[parent_task_id] = {"infile": [], "outfile": [], "scorefile": []}
                        ofile_split = ofile.split(':')
                        if len(ofile_split) == 2 and ofile_split[0] == 'launcher':
                            ofile = 'launcher:../' + parent_task_id + "/" + ofile_split[1]
                        totuminer_parent[parent_task_id]["infile"].append(ifile)
                        totuminer_parent[parent_task_id]["outfile"].append(ofile)
                        scorefile = ofile
                        if scorefile.endswith(".gz"):
                            scorefile = scorefile[:-3]
                        totuminer_parent[parent_task_id]["scorefile"].append(scorefile[:-3])
                for parent_task_id, in_out in six.iteritems(totuminer_parent):
                    content_tuminer = deepcopy(content)
                    content_tuminer["priority"] = priority + 1
                    content_tuminer["ngpus"] = ngpus_recommend
                    content_tuminer["ncpus"] = ncpus_recommend

                    tuminer_resource = service_module.select_resource_from_capacity(resource, Capacity(ngpus_recommend, ncpus_recommend))

                    image_score = "nmtwizard/tuminer"

                    content_tuminer["docker"] = {
                        "image": image_score,
                        "registry": _get_registry(service_module, image_score),
                        "tag": "latest",
                        "command": ["tuminer", "--tumode", "score", "--srcfile"] + in_out["infile"] + ["--tgtfile"] + in_out["outfile"]+ ["--output"] + in_out["scorefile"]
                    }

                    tuminer_task_id, explicitname = build_task_id(content_tuminer, xxyy, "tuminer", parent_task_id)
                    task_create.append(
                            (redis, taskfile_dir,
                             tuminer_task_id, "exec", parent_task_id, tuminer_resource, service,
                             content_tuminer,
                             (), priority+2,
                             ngpus_recommend, ncpus_recommend,
                             {}))
                    task_ids.append("%s\t%s\tngpus: %d, ncpus: %d" % (
                                           "tuminer", tuminer_task_id,
                                           ngpus_recommend, ncpus_recommend))

        iterations -= 1
        if iterations > 0:
            parent_task_id = task_id
            change_parent_task(content["docker"]["command"], parent_task_id)

    (task_ids, task_create) = post_function('POST/task/launch', task_ids, task_create)

    for tc in task_create:
        task.create(*tc)

    if len(task_ids) == 1:
        task_ids = task_ids[0]

    return flask.jsonify(task_ids)
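
For reference, this is the general shape of the JSON document the route expects in its `content` form field. The field names come from the validation and lookups in the code above; every value below is illustrative only:

content = {
    "docker": {
        "registry": "auto",  # resolved via _get_registry, or a registry from the service config
        "image": "nmtwizard/opennmt-tf",  # hypothetical image name
        "tag": "v1.8.0",
        "command": ["--model", "xxyy_mymodel", "train"],
    },
    "options": {},      # service-specific, used to pick a resource
    "ngpus": 1,
    "ncpus": 4,
    "priority": 0,
    "iterations": 1,    # >1 chains several train tasks (train type only)
}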
Example #20
    def _service_unqueue(self, service):
        """find the best next task to push to the work queue
        """
        with self._redis.acquire_lock('service:' + service.name):
            queue = 'queued:%s' % service.name
            count = self._redis.llen(queue)
            idx = 0

            preallocated_task_count = {}
            preallocated_task_resource = {}
            avail_resource = {}
            resources = service.list_resources()
            reserved = {}

            # list free cpu/gpus on each node
            for resource in resources:
                current_xpu_usage = Capacity()
                capacity = resources[resource]
                keygr = 'gpu_resource:%s:%s' % (service.name, resource)
                keycr = 'cpu_resource:%s:%s' % (service.name, resource)
                key_reserved = 'reserved:%s:%s' % (service.name, resource)

                gpu_tasks = self._redis.hgetall(keygr)
                cpu_tasks = self._redis.hgetall(keycr)
                task_reserved = self._redis.get(key_reserved)

                # can not launch multiple tasks on service with no multi-tasking (ec2)
                if not service.resource_multitask and \
                   not task_reserved and \
                   (gpu_tasks or cpu_tasks):
                    continue

                for k, v in six.iteritems(gpu_tasks):
                    if v in preallocated_task_count:
                        preallocated_task_count[v].incr_ngpus(1)
                    else:
                        preallocated_task_count[v] = Capacity(ngpus=1)
                        preallocated_task_resource[v] = resource
                    current_xpu_usage.incr_ngpus(1)
                for k, v in six.iteritems(cpu_tasks):
                    if v in preallocated_task_count:
                        preallocated_task_count[v].incr_ncpus(1)
                    else:
                        preallocated_task_count[v] = Capacity(ncpus=1)
                        preallocated_task_resource[v] = resource
                    current_xpu_usage.incr_ncpus(1)
                available_xpus = capacity - current_xpu_usage
                avail_resource[resource] = available_xpus
                reserved[resource] = task_reserved
                self._logger.debug("\tresource %s - reserved: %s - free %s",
                                   resource, task_reserved or "False",
                                   available_xpus)

            if len(avail_resource) == 0:
                return

            # Go through the tasks, find if there are tasks that can be launched and
            # queue the best one
            best_task_id = None
            best_task_priority = -10000
            best_task_queued_time = 0
            while count > 0:
                count -= 1
                next_task_id = self._redis.lindex(queue, count)

                if next_task_id is not None:
                    next_keyt = 'task:%s' % next_task_id
                    # self._logger.debug("\tcheck task: %s", next_task_id)
                    parent = self._redis.hget(next_keyt, 'parent')
                    # check parent dependency
                    if parent:
                        keyp = 'task:%s' % parent
                        if self._redis.exists(keyp):
                            # if the parent task is in the database, check for dependencies
                            parent_status = self._redis.hget(keyp, 'status')
                            if parent_status != 'stopped':
                                if parent_status == 'running':
                                    # parent is still running so update queued time to be as close
                                    # as possible to terminate time of parent task
                                    self._redis.hset(next_keyt, "queued_time",
                                                     time.time())
                                continue
                            else:
                                if self._redis.hget(keyp,
                                                    'message') != 'completed':
                                    task.terminate(self._redis,
                                                   next_task_id,
                                                   phase='dependency_error')
                                    continue

                    nxpus = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                     self._redis.hget(next_keyt, 'ncpus'))

                    foundResource = False
                    if next_task_id in preallocated_task_count:
                        # if task is pre-allocated, can only continue on the same node
                        r = preallocated_task_resource[next_task_id]
                        nxpus -= preallocated_task_count[next_task_id]
                        avail_r = avail_resource[r]
                        foundResource = ((nxpus.ngpus == 0 and avail_r.ncpus != 0)
                                         or (nxpus.ngpus != 0 and avail_r.ngpus != 0))
                    else:
                        # can the task be launched on any node
                        for r, v in six.iteritems(avail_resource):
                            # cannot launch a new task on a reserved node
                            if reserved[r]:
                                continue
                            if ((nxpus.ngpus > 0
                                 and resources[r].ngpus >= nxpus.ngpus
                                 and v.ngpus > 0)
                                    or (nxpus.ngpus == 0 and v.ncpus >= 0)):
                                foundResource = True
                                break
                    if not foundResource:
                        continue

                    priority = int(self._redis.hget(next_keyt, 'priority'))
                    queued_time = float(
                        self._redis.hget(next_keyt, 'queued_time'))
                    if priority > best_task_priority or (
                            priority == best_task_priority
                            and best_task_queued_time > queued_time):
                        best_task_priority = priority
                        best_task_id = next_task_id
                        best_task_queued_time = queued_time

            if best_task_id:
                self._logger.info('selected %s to be launched on %s',
                                  best_task_id, service.name)
                task.work_queue(self._redis, best_task_id, service.name)
                self._redis.lrem(queue, 0, best_task_id)
Beispiel #21
0
    def _advance_task(self, task_id):
        """Tries to advance the task to the next status. If it can, re-queue it immediately
        to process the next stage. Otherwise, re-queue it after some delay to try again.
        """
        keyt = 'task:%s' % task_id
        with self._redis.acquire_lock(keyt, acquire_timeout=1,
                                      expire_time=600):
            status = self._redis.hget(keyt, 'status')
            if status == 'stopped':
                return

            service_name = self._redis.hget(keyt, 'service')
            if service_name not in self._services:
                raise ValueError('unknown service %s' % service_name)
            service = self._services[service_name]

            self._logger.info('%s: trying to advance from status %s', task_id,
                              status)

            if status == 'queued':
                resource = self._redis.hget(keyt, 'resource')
                parent = self._redis.hget(keyt, 'parent')
                if parent:
                    keyp = 'task:%s' % parent
                    # if the parent task is in the database, check for dependencies
                    if self._redis.exists(keyp):
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status == 'stopped':
                            if self._redis.hget(keyp,
                                                'message') != 'completed':
                                task.terminate(self._redis,
                                               task_id,
                                               phase='dependency_error')
                                return
                        else:
                            self._logger.warning(
                                '%s: depending on other task, waiting',
                                task_id)
                            task.service_queue(self._redis, task_id,
                                               service.name)
                            return
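                # requested capacity for the task, as stored in Redis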
                nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                                 self._redis.hget(keyt, 'ncpus'))
                resource, available_xpus = self._allocate_resource(
                    task_id, resource, service, nxpus)
                if resource is not None:
                    self._logger.info('%s: resource %s reserved %s/%s',
                                      task_id, resource, available_xpus, nxpus)
                    self._redis.hset(keyt, 'alloc_resource', resource)
                    if nxpus == available_xpus:
                        task.set_status(self._redis, keyt, 'allocated')
                    else:
                        task.set_status(self._redis, keyt, 'allocating')
                    task.work_queue(self._redis, task_id, service_name)
                else:
                    self._logger.warning('%s: no resources available, waiting',
                                         task_id)
                    task.service_queue(self._redis, task_id, service.name)
            elif status == 'allocating':
                resource = self._redis.hget(keyt, 'alloc_resource')
                nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                                 self._redis.hget(keyt, 'ncpus'))
                already_allocated_xpus = Capacity()
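                # count the gpu/cpu slots in Redis already assigned to this task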
                keygr = 'gpu_resource:%s:%s' % (service.name, resource)
                for _, v in six.iteritems(self._redis.hgetall(keygr)):
                    if v == task_id:
                        already_allocated_xpus.incr_ngpus(1)
                keycr = 'cpu_resource:%s:%s' % (service.name, resource)
                for _, v in six.iteritems(self._redis.hgetall(keycr)):
                    if v == task_id:
                        already_allocated_xpus.incr_ncpus(1)
                capacity = service.list_resources()[resource]
                available_xpus, remaining_xpus = self._reserve_resource(
                    service, resource, capacity,
                    task_id, nxpus - already_allocated_xpus, Capacity(),
                    Capacity(-1, -1), True)
                self._logger.info(
                    'task: %s - resource: %s (capacity %s) - already %s - available %s',
                    task_id, resource, capacity, already_allocated_xpus,
                    available_xpus)
                if available_xpus and available_xpus == nxpus - already_allocated_xpus:
                    task.set_status(self._redis, keyt, 'allocated')
                    key_reserved = 'reserved:%s:%s' % (service.name, resource)
                    self._redis.delete(key_reserved)
                    task.work_queue(self._redis, task_id, service.name)
                else:
                    task.work_queue(self._redis,
                                    task_id,
                                    service.name,
                                    delay=20)
            elif status == 'allocated':
                content = json.loads(self._redis.hget(keyt, 'content'))
                resource = self._redis.hget(keyt, 'alloc_resource')
                self._logger.info('%s: launching on %s', task_id, service.name)
                try:
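                    # collect the gpu/cpu slot ids reserved for this task so
                    # they can be passed to service.launch()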
                    keygr = 'gpu_resource:%s:%s' % (service.name, resource)
                    lgpu = []
                    for k, v in six.iteritems(self._redis.hgetall(keygr)):
                        if v == task_id:
                            lgpu.append(k)
                    self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
                    keycr = 'cpu_resource:%s:%s' % (service.name, resource)
                    lcpu = []
                    for k, v in six.iteritems(self._redis.hgetall(keycr)):
                        if v == task_id:
                            lcpu.append(k)
                    self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))
                    data = service.launch(
                        task_id, content['options'], (lgpu, lcpu), resource,
                        content['docker']['registry'],
                        content['docker']['image'], content['docker']['tag'],
                        content['docker']['command'],
                        task.file_list(self._redis, self._taskfile_dir,
                                       task_id), content['wait_after_launch'],
                        self._redis.hget(keyt, 'token'),
                        content.get('support_statistics'))
                except EnvironmentError as e:
                    # the resource is not available and will be set busy
                    self._block_resource(resource, service, str(e))
                    self._redis.hdel(keyt, 'alloc_resource')
                    # set the task as queued again
                    self._release_resource(
                        service, resource, task_id,
                        Capacity(self._redis.hget(keyt, 'ngpus'),
                                 self._redis.hget(keyt, 'ncpus')))
                    task.set_status(self._redis, keyt, 'queued')
                    task.service_queue(self._redis, task_id, service.name)
                    self._logger.info(
                        'could not launch [%s] %s on %s: blocking resource',
                        str(e), task_id, resource)
                    return
                except Exception as e:
                    # all other errors make the task fail
                    self._logger.info('fail task [%s] - %s', task_id, str(e))
                    task.append_log(self._redis, self._taskfile_dir, task_id,
                                    str(e))
                    task.terminate(self._redis, task_id, phase='launch_error')
                    return
                self._logger.info('%s: task started on %s', task_id,
                                  service.name)
                self._redis.hset(keyt, 'job', json.dumps(data))
                task.set_status(self._redis, keyt, 'running')
                # For services that do not notify their activity, we should
                # poll the task status more regularly.
                task.work_queue(self._redis, task_id, service.name,
                                delay=120 if service.is_notifying_activity else 30)

            elif status == 'running':
                self._logger.debug('- checking activity of task: %s', task_id)
                data = json.loads(self._redis.hget(keyt, 'job'))
                try:
                    status = service.status(task_id, data)
                except Exception as e:
                    self._logger.info('cannot get status for [%s] - %s',
                                      task_id, str(e))
                    # hincrby returns the updated failure count as an int
                    if self._redis.hincrby(keyt, 'status_fail', 1) > 4:
                        task.terminate(self._redis,
                                       task_id,
                                       phase='lost_connection')
                        return
                else:
                    self._redis.hdel(keyt, 'status_fail')
                if status == 'dead':
                    self._logger.info(
                        '%s: task no longer running on %s, request termination',
                        task_id, service.name)
                    task.terminate(self._redis, task_id, phase='exited')
                else:
                    task.work_queue(self._redis, task_id, service.name,
                                    delay=600 if service.is_notifying_activity else 120)

            elif status == 'terminating':
                data = self._redis.hget(keyt, 'job')
                nxpus = Capacity(self._redis.hget(keyt, 'ngpus'),
                                 self._redis.hget(keyt, 'ncpus'))
                if data is not None:
                    container_id = self._redis.hget(keyt, 'container_id')
                    data = json.loads(data)
                    data['container_id'] = container_id
                    self._logger.info('%s: terminating task (job: %s)',
                                      task_id, json.dumps(data))
                    try:
                        service.terminate(data)
                        self._logger.info('%s: terminated', task_id)
                    except Exception:
                        self._logger.warning('%s: failed to terminate',
                                             task_id)
                else:
                    self._logger.info('%s: terminating task (on error)',
                                      task_id)
                resource = self._redis.hget(keyt, 'alloc_resource')
                if resource:
                    self._release_resource(service, resource, task_id, nxpus)
                task.set_status(self._redis, keyt, 'stopped')
                task.disable(self._redis, task_id)
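
Read end to end, the branches above walk a task through a linear lifecycle. A rough summary of the transitions handled by _advance_task, derived from the code itself rather than any launcher documentation:

# queued      -> allocating/allocated once resources are reserved, otherwise back to the service queue
# allocating  -> allocated when the remaining xpus have been reserved (retried with delay=20)
# allocated   -> running after service.launch(); an EnvironmentError blocks the resource and re-queues the task
# running     -> polled periodically; a 'dead' job is terminated with phase='exited'
# terminating -> stopped after service.terminate(), releasing the allocated resources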
Beispiel #22
0
    def list_resources(self):
        return {
            _hostname(server): Capacity(len(server['gpus']),
                                        len(server['cpus']))
            for server in self._config['variables']['server_pool']
        }
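
list_resources maps every server in the configured server_pool to its total Capacity, keyed by hostname. A hypothetical configuration fragment showing the shape the method expects ('gpus' and 'cpus' come from the code above; the 'host' field is an assumption for illustration):

config = {'variables': {'server_pool': [
    # one entry per server; 'gpus'/'cpus' list the usable device/core slots
    {'host': 'node1', 'gpus': [1, 2], 'cpus': [0, 1, 2, 3]},
]}}

# list_resources() would then return {'node1': Capacity(2, 4)},
# assuming _hostname() resolves the entry to 'node1'.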
Beispiel #23
0
from nmtwizard.capacity import Capacity

# extracted from https://www.ec2instances.info - may need occasional updates
ec2_capacity_map = {
    "a1.2xlarge": Capacity(0, 8),
    "a1.xlarge": Capacity(0, 4),
    "a1.large": Capacity(0, 2),
    "a1.medium": Capacity(0, 1),
    "a1.4xlarge": Capacity(0, 16),
    "c1.xlarge": Capacity(0, 8),
    "c1.medium": Capacity(0, 2),
    "c3.2xlarge": Capacity(0, 8),
    "c3.8xlarge": Capacity(0, 32),
    "c3.xlarge": Capacity(0, 4),
    "c3.large": Capacity(0, 2),
    "c3.4xlarge": Capacity(0, 16),
    "c4.2xlarge": Capacity(0, 8),
    "c4.8xlarge": Capacity(0, 36),
    "c4.xlarge": Capacity(0, 4),
    "c4.large": Capacity(0, 2),
    "c4.4xlarge": Capacity(0, 16),
    "c5.18xlarge": Capacity(0, 72),
    "c5d.18xlarge": Capacity(0, 72),
    "c5.9xlarge": Capacity(0, 36),
    "c5d.9xlarge": Capacity(0, 36),
    "c5.2xlarge": Capacity(0, 8),
    "c5d.2xlarge": Capacity(0, 8),
    "c5.xlarge": Capacity(0, 4),
    "c5d.xlarge": Capacity(0, 4),
    "c5.large": Capacity(0, 2),
    "c5d.large": Capacity(0, 2),