Beispiel #1
0
 def _handle_running_task(self, task_id):
     """Check whether a running task is still alive on its service.

     Reads the launch data stored in the ``job`` field of ``task:<task_id>``
     and asks the service for the task status. A ``'dead'`` status requests
     termination with phase ``exited``; any other status re-queues the task
     in the work queue for a later check (after 600 seconds when the service
     notifies its activity, 120 seconds otherwise).

     Any exception raised while checking or re-queuing is counted in the
     ``status_fail`` field of the task hash. Once more than 4 failures have
     accumulated, the task is terminated with phase ``lost_connection``.
     The counter is cleared only after a check that did not raise, so that
     consecutive failures can actually add up to the threshold.

     NOTE(review): a failed check does not re-queue the task from here; the
     next check presumably relies on the caller or on another mechanism --
     confirm.

     Returns:
         The result of ``task.terminate`` when the connection is considered
         lost, otherwise ``None``.
     """
     keyt = 'task:%s' % task_id
     _, service = self._get_service(keyt=keyt)
     self._logger.debug('- checking activity of task: %s', task_id)
     data = json.loads(self._redis.hget(keyt, 'job'))
     try:
         status = service.status(task_id, data)
         if status == 'dead':
             self._logger.info(
                 '%s: task no longer running on %s, request termination',
                 task_id, service.name)
             task.terminate(self._redis, task_id, phase='exited')
         else:
             task.work_queue(
                 self._redis,
                 task_id,
                 service.name,
                 delay=600 if service.is_notifying_activity else 120)
     except Exception as e:
         self._logger.info('cannot get status for [%s] - %s', task_id,
                           str(e))
         # HINCRBY returns the value after the increment, so there is no
         # need for a second round-trip to read the counter back.
         failures = int(self._redis.hincrby(keyt, 'status_fail', 1))
         if failures > 4:
             return task.terminate(self._redis,
                                   task_id,
                                   phase='lost_connection')
     else:
         # The check went through: forget about earlier failures.
         self._redis.hdel(keyt, 'status_fail')
     return None
Beispiel #2
0
def terminate(task_id):
    """Request termination of a task on behalf of an HTTP client.

    While holding the lock on ``task_id``: aborts with a 404 JSON response
    when the task has no status, answers right away when the task is already
    stopped, and otherwise calls ``task.terminate`` with the optional
    ``phase`` query argument of the request.

    Returns a JSON response describing the outcome.
    """
    with redis.acquire_lock(task_id):
        status = task.info(redis, task_id, "status")
        if status is None:
            not_found = flask.make_response(
                flask.jsonify(message="task %s unknown" % task_id), 404)
            flask.abort(not_found)
        if status == "stopped":
            return flask.jsonify(message="%s already stopped" % task_id)
        task.terminate(redis, task_id,
                       phase=flask.request.args.get('phase'))
    return flask.jsonify(message="terminating %s" % task_id)
Beispiel #3
0
def terminate(task_id):
    """Request termination of a task, running the post hook first.

    The task status is checked while holding the lock on ``task_id``: an
    unknown task aborts with a 404 JSON response and an already stopped task
    is reported as such. Once the lock is released, ``post_function`` is
    called for ``'GET/task/terminate'``; a truthy result is treated as an
    error and the task is terminated with phase ``publish_error`` instead of
    the ``phase`` query argument of the request.

    Returns a JSON response describing the outcome.
    """
    with redis.acquire_lock(task_id):
        status = task.info(redis, taskfile_dir, task_id, "status")
        if status is None:
            not_found = flask.make_response(
                flask.jsonify(message="task %s unknown" % task_id), 404)
            abort(not_found)
        if status == "stopped":
            return flask.jsonify(message="%s already stopped" % task_id)
        phase = flask.request.args.get('phase')

    post_error = post_function('GET/task/terminate', task_id, phase)
    if not post_error:
        task.terminate(redis, task_id, phase=phase)
        return flask.jsonify(message="terminating %s" % task_id)

    task.terminate(redis, task_id, phase="publish_error")
    return flask.jsonify(
        message="problem while posting model: %s" % post_error)
Beispiel #4
0
            def try_create(next_task_id):
                """Build a CandidateTask for a queued task when it may be scheduled.

                Returns None when the owner entity has no usage entry in the
                resource manager, when the parent task (if it exists in the
                database) is not stopped or did not complete, or when no
                machine is both authorized for and large enough for the
                requested capacity. A task whose parent stopped without the
                'completed' message is terminated with 'dependency_error'.
                """
                task_key = 'task:%s' % next_task_id
                parent_id = self._redis.hget(task_key, 'parent')
                owner = task.get_owner_entity(self._redis, next_task_id)

                if owner not in resource_mgr.entities_usage:
                    self._logger.error(
                        "\t[Task %s] entity %s - without usage limit !",
                        next_task_id, owner)
                    return None

                # Dependency check: only applies when the parent task is
                # still present in the database.
                if parent_id and self._redis.exists('task:%s' % parent_id):
                    parent_key = 'task:%s' % parent_id
                    parent_status = self._redis.hget(parent_key, 'status')
                    if parent_status == 'running':
                        # Keep the queued time as close as possible to the
                        # moment the parent task terminates.
                        self._redis.hset(task_key, "queued_time", time.time())
                        return None
                    if parent_status != 'stopped':
                        return None
                    if self._redis.hget(parent_key, 'message') != 'completed':
                        task.terminate(self._redis,
                                       next_task_id,
                                       phase='dependency_error')
                        return None

                requested = Capacity(self._redis.hget(task_key, 'ngpus'),
                                     self._redis.hget(task_key, 'ncpus'))
                candidate = CandidateTask(
                    next_task_id, owner, self._redis, requested,
                    resource_mgr.entities_usage[owner], self._logger)

                # The candidate is only worth keeping if at least one machine
                # could ever host it.
                for machine in six.itervalues(resource_mgr._machines):
                    if not machine._is_authorized(candidate._entity,
                                                  candidate._capacity):
                        continue
                    if candidate._capacity.inf_or_eq(machine._init_capacity):
                        return candidate

                return None
Beispiel #5
0
    def run(self):
        """Main worker loop: react to key expirations and advance queued tasks.

        Subscribes to keyspace notifications for ``beat:*`` and ``queue:*``
        keys. An expired beat terminates the task with phase ``expired``; an
        expired queue key puts the task back in the queue. When no message is
        pending, one task is taken from the queue and advanced. The loop
        never returns and sleeps 0.1 second between iterations.
        """
        beat_prefix = '__keyspace@0__:beat:'
        queue_prefix = '__keyspace@0__:queue:'

        self._logger.info('Starting worker')

        # Subscribe to beat expiration.
        pubsub = self._redis.pubsub()
        pubsub.psubscribe('__keyspace@0__:beat:*')
        pubsub.psubscribe('__keyspace@0__:queue:*')

        while True:
            message = pubsub.get_message()
            if not message:
                task_id = task.unqueue(self._redis)
                if task_id is not None:
                    try:
                        self._advance_task(task_id)
                    except RuntimeWarning:
                        # Lock not acquired: give the task another chance.
                        self._logger.warning(
                            '%s: failed to acquire a lock, retrying', task_id)
                        task.queue(self._redis, task_id)
                    except Exception as e:
                        self._logger.error('%s: %s', task_id, str(e))
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis,
                                           task_id,
                                           phase="launch_error")
            else:
                channel = message['channel']
                data = message['data']
                expired = data == 'expired'
                if expired and channel.startswith(beat_prefix):
                    # The task id is whatever follows the channel prefix.
                    task_id = channel[len(beat_prefix):]
                    self._logger.info('%s: task expired', task_id)
                    with self._redis.acquire_lock(task_id):
                        task.terminate(self._redis, task_id, phase='expired')
                elif expired and channel.startswith(queue_prefix):
                    task_id = channel[len(queue_prefix):]
                    task.queue(self._redis, task_id)
            time.sleep(0.1)
Beispiel #6
0
    def run(self):
        """Listen for expired ``beat:*`` / ``queue:*`` keys and react to them.

        Installs ``graceful_exit`` for SIGTERM and SIGINT, then loops forever.
        For a task of a known service whose beat expired, the
        ``task/terminate`` endpoint of the service callback URL is called
        (with the task token embedded as credentials when there is one) and
        the task is terminated with phase ``expired``. A task whose queue key
        expired is moved back to the work queue of its service. The loop
        sleeps ``self._work_cycle`` seconds between iterations.
        """
        beat_prefix = '__keyspace@0__:beat:'
        queue_prefix = '__keyspace@0__:queue:'

        for signum in (signal.SIGTERM, signal.SIGINT):
            signal.signal(signum, graceful_exit)

        pubsub = self._redis.pubsub()
        pubsub.psubscribe('__keyspace@0__:beat:*')
        pubsub.psubscribe('__keyspace@0__:queue:*')

        while True:
            message = pubsub.get_message()
            if message:
                channel = message['channel']
                data = message['data']
                expired = data == 'expired'
                if expired and channel.startswith(beat_prefix):
                    # task expired, no beat was received
                    task_id = channel[len(beat_prefix):]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: task expired', task_id)
                        auth_token = self._redis.hget('task:%s' % task_id, 'token')
                        callback_url = self._services[service]._config.get('callback_url')
                        if auth_token:
                            credentials = "://" + auth_token + ":x@"
                            callback_url = callback_url.replace("://", credentials)
                        terminate_url = os.path.join(callback_url, "task/terminate", task_id)
                        r = requests.get(terminate_url, params={'phase': 'expired'})
                        if r.status_code != 200:
                            self._logger.warning("incorrect result from 'task/terminate' service: %s" % r.text)
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis, task_id, phase='expired')
                elif expired and channel.startswith(queue_prefix):
                    # expired in the queue - comes back in the work queue
                    task_id = channel[len(queue_prefix):]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: move to work queue', task_id)
                        task.work_queue(self._redis, task_id, service)

            time.sleep(self._work_cycle)
Beispiel #7
0
    def run(self):
        """Consume the work queue of ``self._service`` forever.

        Installs ``graceful_exit`` for SIGTERM and SIGINT. Each iteration
        takes at most one task from the work queue and tries to advance it:
        a ``RuntimeWarning`` puts the task back in the work queue, any other
        exception is logged, stored in the task log and terminates the task
        with phase ``launch_error``. Once the iteration counter exceeds
        ``self._refresh_counter``, and if tasks are queued for the service,
        the best task to process is selected. The loop sleeps
        ``self._work_cycle`` seconds between iterations.
        """
        for signum in (signal.SIGTERM, signal.SIGINT):
            signal.signal(signum, graceful_exit)
        self._logger.info('Starting...')

        iterations = 0

        while True:
            # process one element from work queue
            task_id = task.work_unqueue(self._redis, self._service)
            if task_id is not None:
                try:
                    self._advance_task(task_id)
                except RuntimeWarning:
                    self._logger.warning(
                        '%s: failed to acquire a lock, retrying', task_id)
                    task.work_queue(self._redis, task_id, self._service)
                except Exception as e:
                    error_text = str(e)
                    self._logger.error('%s: %s', task_id, error_text)
                    with self._redis.acquire_lock(task_id):
                        task.set_log(self._redis, self._taskfile_dir, task_id,
                                     str(e))
                        task.terminate(self._redis,
                                       task_id,
                                       phase="launch_error")
                    self._logger.info(traceback.format_exc())

            # periodically check whether some free resource can be found
            refresh_due = iterations > self._refresh_counter
            if refresh_due:
                # if there are some queued tasks, look for free resources
                if self._redis.exists('queued:%s' % self._service):
                    self._logger.debug('checking processes on : %s',
                                       self._service)
                    current_service = self._services[self._service]
                    self._select_best_task_to_process(current_service)
                iterations = 0

            iterations += 1
            time.sleep(self._work_cycle)
    def run(self):
        """Listen for expired ``beat:*`` / ``queue:*`` keys and react to them.

        Installs ``graceful_exit`` for SIGTERM and SIGINT, then loops forever.
        A task of a known service whose beat expired is terminated with phase
        ``expired`` while holding its lock. A task whose queue key expired is
        moved back to the work queue of its service. The loop sleeps
        ``self._work_cycle`` seconds between iterations.
        """
        beat_prefix = '__keyspace@0__:beat:'
        queue_prefix = '__keyspace@0__:queue:'

        for signum in (signal.SIGTERM, signal.SIGINT):
            signal.signal(signum, graceful_exit)

        pubsub = self._redis.pubsub()
        pubsub.psubscribe('__keyspace@0__:beat:*')
        pubsub.psubscribe('__keyspace@0__:queue:*')

        while True:
            message = pubsub.get_message()
            if message:
                channel = message['channel']
                data = message['data']
                expired = data == 'expired'
                if expired and channel.startswith(beat_prefix):
                    # task expired, no beat was received
                    task_id = channel[len(beat_prefix):]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: task expired', task_id)
                        with self._redis.acquire_lock(task_id):
                            task.terminate(self._redis, task_id,
                                           phase='expired')
                elif expired and channel.startswith(queue_prefix):
                    # expired in the queue - comes back in the work queue
                    task_id = channel[len(queue_prefix):]
                    service = self._redis.hget('task:' + task_id, 'service')
                    if service in self._services:
                        self._logger.info('%s: move to work queue', task_id)
                        task.work_queue(self._redis, task_id, service)

            time.sleep(self._work_cycle)
Beispiel #9
0
    def _advance_task(self, task_id):
        """Tries to advance the task to the next status. If it can, re-queue it immediately
        to process the next stage. Otherwise, re-queue it after some delay to try again.

        The whole transition runs while holding the lock on ``task:<task_id>``.
        The status read from the task hash selects the step:

        * ``stopped``: nothing to do.
        * ``queued``: check the parent dependency, then try to allocate a
          resource; the task becomes ``allocated`` or ``allocating``, or goes
          back to the service queue.
        * ``allocating``: try to reserve the GPUs still missing.
        * ``allocated``: launch the task on the service; on success the task
          becomes ``running``.
        * ``running``: poll the service; a ``'dead'`` status requests
          termination.
        * ``terminating``: terminate the job on the service, release the
          resource and mark the task ``stopped``.

        Raises:
            ValueError: if the service recorded for the task is not one of
                ``self._services``.
        """
        keyt = 'task:%s' % task_id
        with self._redis.acquire_lock(keyt, acquire_timeout=1, expire_time=600):
            status = self._redis.hget(keyt, 'status')
            if status == 'stopped':
                return

            service_name = self._redis.hget(keyt, 'service')
            if service_name not in self._services:
                raise ValueError('unknown service %s' % service_name)
            service = self._services[service_name]

            self._logger.info('%s: trying to advance from status %s', task_id, status)

            if status == 'queued':
                resource = self._redis.hget(keyt, 'resource')
                parent = self._redis.hget(keyt, 'parent')
                if parent:
                    keyp = 'task:%s' % parent
                    # if the parent task is in the database, check for dependencies
                    if self._redis.exists(keyp):
                        # NOTE: `status` is rebound to the parent status here; the
                        # branch of the if/elif chain has already been selected.
                        status = self._redis.hget(keyp, 'status')
                        if status == 'stopped':
                            # a parent that stopped without completing fails the child
                            if self._redis.hget(keyp, 'message') != 'completed':
                                task.terminate(self._redis, task_id, phase='dependency_error')
                                return
                        else:
                            # parent not stopped yet: back to the service queue
                            self._logger.warning('%s: depending on other task, waiting', task_id)
                            task.service_queue(self._redis, task_id, service.name)
                            return
                ngpus = int(self._redis.hget(keyt, 'ngpus'))
                resource, available_gpus = self._allocate_resource(task_id, resource, service, ngpus)
                if resource is not None:
                    self._logger.info('%s: resource %s reserved (%d/%d)',
                                      task_id, resource, available_gpus, ngpus)
                    self._redis.hset(keyt, 'alloc_resource', resource)
                    # fully allocated only when every requested GPU was obtained
                    if ngpus == available_gpus:
                        task.set_status(self._redis, keyt, 'allocated')
                    else:
                        task.set_status(self._redis, keyt, 'allocating')
                    task.work_queue(self._redis, task_id, service_name)
                else:
                    self._logger.warning('%s: no resources available, waiting', task_id)
                    task.service_queue(self._redis, task_id, service.name)
            elif status == 'allocating':
                resource = self._redis.hget(keyt, 'alloc_resource')
                keyr = 'resource:%s:%s' % (service.name, resource)
                ngpus = int(self._redis.hget(keyt, 'ngpus'))
                # count the entries of the resource hash already assigned to this task
                already_allocated_gpus = 0
                for k, v in six.iteritems(self._redis.hgetall(keyr)):
                    if v == task_id:
                        already_allocated_gpus += 1
                capacity = service.list_resources()[resource]
                # try to reserve the GPUs that are still missing
                available_gpus, remaining_gpus = self._reserve_resource(service, resource,
                                                                        capacity, task_id,
                                                                        ngpus - already_allocated_gpus,
                                                                        0, -1, True)
                self._logger.warning('task: %s - resource: %s (capacity %d)- already %d - available %d', task_id, resource, capacity, already_allocated_gpus, available_gpus)
                if available_gpus == ngpus - already_allocated_gpus:
                    # everything was obtained: drop the reservation on the resource
                    task.set_status(self._redis, keyt, 'allocated')
                    key_reserved = 'reserved:%s:%s' % (service.name, resource)
                    self._redis.delete(key_reserved)
                    task.work_queue(self._redis, task_id, service.name)
                else:
                    # still incomplete: try again after a delay
                    task.work_queue(self._redis, task_id, service.name,
                                    delay=service.is_notifying_activity and 120 or 30)
            elif status == 'allocated':
                content = json.loads(self._redis.hget(keyt, 'content'))
                resource = self._redis.hget(keyt, 'alloc_resource')
                self._logger.info('%s: launching on %s', task_id, service.name)
                try:
                    keyr = 'resource:%s:%s' % (service.name, resource)
                    # fields of the resource hash whose value is this task id
                    lgpu = []
                    for k, v in six.iteritems(self._redis.hgetall(keyr)):
                        if v == task_id:
                            lgpu.append(k)
                    self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
                    data = service.launch(
                        task_id,
                        content['options'],
                        lgpu,
                        resource,
                        content['docker']['registry'],
                        content['docker']['image'],
                        content['docker']['tag'],
                        content['docker']['command'],
                        task.file_list(self._redis, task_id),
                        content['wait_after_launch'])
                except EnvironmentError as e:
                    # the resource is not available and will be set busy
                    self._block_resource(resource, service, str(e))
                    # set the task as queued again
                    self._redis.hdel(keyt, 'alloc_resource')
                    self._release_resource(service, resource, task_id)
                    task.set_status(self._redis, keyt, 'queued')
                    task.service_queue(self._redis, task_id, service.name)
                    self._logger.info('could not launch [%s] %s on %s: blocking resource', str(e), task_id, resource)
                    return
                except Exception as e:
                    # all other errors make the task fail
                    task.append_log(self._redis, task_id, str(e))
                    task.terminate(self._redis, task_id, phase='launch_error')
                    return
                self._logger.info('%s: task started on %s', task_id, service.name)
                # keep what the service returned at launch in the 'job' field
                self._redis.hset(keyt, 'job', json.dumps(data))
                task.set_status(self._redis, keyt, 'running')
                # For services that do not notify their activity, we should
                # poll the task status more regularly.
                task.work_queue(self._redis, task_id, service.name,
                                delay=service.is_notifying_activity and 120 or 30)

            elif status == 'running':
                self._logger.debug('- checking activity of task: %s', task_id)
                data = json.loads(self._redis.hget(keyt, 'job'))
                # NOTE: `status` now holds the status reported by the service
                status = service.status(task_id, data)
                if status == 'dead':
                    self._logger.info('%s: task no longer running on %s, request termination',
                                      task_id, service.name)
                    task.terminate(self._redis, task_id, phase='exited')
                else:
                    # still alive: check again later
                    task.work_queue(self._redis, task_id, service.name,
                                    delay=service.is_notifying_activity and 120 or 30)

            elif status == 'terminating':
                data = self._redis.hget(keyt, 'job')
                # only a task with recorded job data is terminated on the service
                if data is not None:
                    container_id = self._redis.hget(keyt, 'container_id')
                    data = json.loads(data)
                    data['container_id'] = container_id
                    self._logger.info('%s: terminating task (%s)', task_id, json.dumps(data))
                    try:
                        service.terminate(data)
                        self._logger.info('%s: terminated', task_id)
                    except Exception:
                        # best effort: a failed termination is only logged
                        self._logger.warning('%s: failed to terminate', task_id)
                # in every case release the resource and mark the task stopped
                resource = self._redis.hget(keyt, 'alloc_resource')
                self._release_resource(service, resource, task_id)
                task.set_status(self._redis, keyt, 'stopped')
                task.disable(self._redis, task_id)
Beispiel #10
0
    def run(self):
        """Main worker loop handling key expirations and the work queues of all services.

        Subscribes to keyspace notifications for ``beat:*`` and ``queue:*``
        keys. When a message is pending it is handled (expired beat: the task
        is terminated with phase ``expired``; expired queue key: the task is
        moved to the work queue of its service). Otherwise, for every known
        service, one task is taken from the work queue and advanced; when the
        work queue is empty and the iteration counter exceeds
        ``self._refresh_counter``, the resources of the service are inspected
        to shorten a pending reservation and to dequeue a waiting task.
        The loop never returns and sleeps 0.01 second between iterations.
        """
        self._logger.info('Starting worker')

        # Subscribe to beat expiration.
        pubsub = self._redis.pubsub()
        pubsub.psubscribe('__keyspace@0__:beat:*')
        pubsub.psubscribe('__keyspace@0__:queue:*')
        counter = 0

        while True:
            message = pubsub.get_message()
            if message:
                channel = message['channel']
                data = message['data']
                if data == 'expired':
                    self._logger.warning('received expired event on channel %s', channel)
                    if channel.startswith('__keyspace@0__:beat:'):
                        # 20 is the length of the '__keyspace@0__:beat:' prefix
                        task_id = channel[20:]
                        service = self._redis.hget('task:'+task_id, 'service')
                        # tasks of services unknown to this worker are ignored
                        if service in self._services:
                            self._logger.info('%s: task expired', task_id)
                            with self._redis.acquire_lock(task_id):
                                task.terminate(self._redis, task_id, phase='expired')
                    elif channel.startswith('__keyspace@0__:queue:'):
                        # 21 is the length of the '__keyspace@0__:queue:' prefix
                        task_id = channel[21:]
                        service = self._redis.hget('task:'+task_id, 'service')
                        if service in self._services:
                            task.work_queue(self._redis, task_id, service)
            else:
                # no pending message: process the work queue of each service
                for service in self._services:
                    task_id = task.work_unqueue(self._redis, service)
                    if task_id is not None:
                        try:
                            self._advance_task(task_id)
                        except RuntimeWarning:
                            self._logger.warning(
                                '%s: failed to acquire a lock, retrying', task_id)
                            task.work_queue(self._redis, task_id, service)
                        except Exception as e:
                            # any other error is logged on the task, which is terminated
                            self._logger.error('%s: %s', task_id, str(e))
                            with self._redis.acquire_lock(task_id):
                                task.set_log(self._redis, task_id, str(e))
                                task.terminate(self._redis, task_id, phase="launch_error")
                    else:
                        # the work queue is empty: periodically inspect the resources
                        if counter > self._refresh_counter:
                            resources = self._services[service].list_resources()
                            # First pass: on the first non-busy resource below capacity
                            # whose reserving task has a 'queue:' key with a TTL above
                            # 10, shorten that TTL to 5 and stop.
                            for resource in resources:                                    
                                keyr = 'resource:%s:%s' % (service, resource)
                                key_busy = 'busy:%s:%s' % (service, resource)
                                key_reserved = 'reserved:%s:%s' % (service, resource)
                                if not self._redis.exists(key_busy) and self._redis.hlen(keyr) < resources[resource]:
                                    if self._redis.exists(key_reserved) and self._redis.ttl('queue:'+self._redis.get(key_reserved))>10:
                                        self._redis.expire('queue:'+self._redis.get(key_reserved), 5)
                                        break
                            if self._redis.exists('queued:%s' % service):
                                resources = self._services[service].list_resources()
                                self._logger.debug('checking processes on : %s', service)
                                # Second pass: only the first non-busy resource below
                                # capacity is considered (the break is unconditional);
                                # it counts as available only when it is not reserved.
                                availableResource = False
                                for resource in resources:                                    
                                    keyr = 'resource:%s:%s' % (service, resource)
                                    key_busy = 'busy:%s:%s' % (service, resource)
                                    key_reserved = 'reserved:%s:%s' % (service, resource)
                                    if not self._redis.exists(key_busy) and self._redis.hlen(keyr) < resources[resource]:
                                        if not self._redis.exists(key_reserved):
                                            availableResource = True
                                        break
                                if availableResource:
                                    self._logger.debug('resources available on %s - trying dequeuing', service)
                                    self._service_unqueue(self._services[service])
                # reset the counter only after every service had its turn
                if counter > self._refresh_counter:
                    counter = 0
            counter += 1
            time.sleep(0.01)
Beispiel #11
0
 def _handle_allocated_task(self, task_id):
     """Launch an allocated task on its service.

     The GPU and CPU slots assigned to the task are read from the
     ``gpu_resource`` / ``cpu_resource`` hashes of the allocated resource,
     recorded on the task hash and passed to ``service.launch``.

     * ``EnvironmentError``: the resource is blocked and released, and the
       task goes back to ``queued`` unless it is already ``terminating``.
     * Any other exception: the error is appended to the task log, the
       ``task/terminate`` callback of the service is called and the task is
       terminated with phase ``launch_error``.
     * Success: the launch data is stored in the ``job`` field and the task
       becomes ``running`` unless it is already ``terminating``.

     Raises:
         RuntimeError: if the ``task/terminate`` callback does not answer
             with status code 200.

     Always returns ``None``.
     """
     keyt = 'task:%s' % task_id
     _, service = self._get_service(keyt=keyt)
     content = json.loads(self._redis.hget(keyt, 'content'))
     resource = self._redis.hget(keyt, 'alloc_resource')
     self._logger.info('%s: launching on %s', task_id, service.name)

     def slots_of_task(hash_key):
         # Fields of the given resource hash whose value is this task id.
         return [slot
                 for slot, owner in six.iteritems(self._redis.hgetall(hash_key))
                 if owner == task_id]

     try:
         entity_config = self._get_current_config(task_id)
         lgpu = slots_of_task('gpu_resource:%s:%s' % (service.name, resource))
         self._redis.hset(keyt, 'alloc_lgpu', ",".join(lgpu))
         lcpu = slots_of_task('cpu_resource:%s:%s' % (service.name, resource))
         self._redis.hset(keyt, 'alloc_lcpu', ",".join(lcpu))
         docker_content = content['docker']
         data = service.launch(
             task_id,
             content['options'],
             (lgpu, lcpu),
             resource,
             entity_config["storages"],
             entity_config["docker"],
             docker_content['registry'],
             docker_content['image'],
             docker_content['tag'],
             docker_content['command'],
             task.file_list(self._taskfile_dir, task_id),
             content['wait_after_launch'],
             self._redis.hget(keyt, 'token'),
             content.get('support_statistics'))
     except EnvironmentError as env_error:
         # the resource is not available and will be set busy
         self._block_resource(resource, service, str(env_error))
         self._redis.hdel(keyt, 'alloc_resource')
         requested = Capacity(self._redis.hget(keyt, 'ngpus'),
                              self._redis.hget(keyt, 'ncpus'))
         self._release_resource(service, resource, task_id, requested)
         if self._redis.hget(keyt, 'status') == 'terminating':
             return None
         # set the task as queued again
         task.set_status(self._redis, keyt, 'queued')
         task.service_queue(self._redis, task_id, service.name)
         self._logger.info(
             'could not launch [%s] %s on %s: blocking resource',
             str(env_error), task_id, resource)
         self._logger.info(traceback.format_exc())
         return None
     except Exception as error:
         # all other errors make the task fail
         self._logger.info('fail task [%s] - %s', task_id, str(error))
         self._logger.info(traceback.format_exc())
         task.append_log(self._taskfile_dir, task_id, str(error))
         auth_token = self._redis.hget(keyt, 'token')
         callback_url = service._config.get('callback_url')
         if auth_token:
             credentials = "://" + auth_token + ":x@"
             callback_url = callback_url.replace("://", credentials)
         terminate_url = os.path.join(callback_url, "task/terminate",
                                      task_id)
         r = requests.get(terminate_url, params={'phase': 'launch_error'})
         if r.status_code != 200:
             raise RuntimeError(
                 "incorrect result from 'task/terminate' service: %s" %
                 r.text) from error
         task.terminate(self._redis, task_id, phase='launch_error')
         self._logger.info(traceback.format_exc())
         return None

     self._logger.info('%s: task started on %s', task_id, service.name)
     self._redis.hset(keyt, 'job', json.dumps(data))
     if self._redis.hget(keyt, 'status') == 'terminating':
         return None
     task.set_status(self._redis, keyt, 'running')
     # For services that do not notify their activity, we should
     # poll the task status more regularly.
     poll_delay = 120 if service.is_notifying_activity else 30
     task.work_queue(self._redis, task_id, service.name, delay=poll_delay)
     return None
Beispiel #12
0
    def _service_unqueue(self, service):
        """find the best next task to push to the work queue
        """
        with self._redis.acquire_lock('service:' + service.name):
            queue = 'queued:%s' % service.name
            count = self._redis.llen(queue)
            idx = 0

            preallocated_task_count = {}
            preallocated_task_resource = {}
            avail_resource = {}
            resources = service.list_resources()
            reserved = {}

            # list free cpu/gpus on each node
            for resource in resources:
                current_xpu_usage = Capacity()
                capacity = resources[resource]
                keygr = 'gpu_resource:%s:%s' % (self._service, resource)
                keycr = 'cpu_resource:%s:%s' % (self._service, resource)
                key_reserved = 'reserved:%s:%s' % (service.name, resource)

                gpu_tasks = self._redis.hgetall(keygr)
                cpu_tasks = self._redis.hgetall(keycr)
                task_reserved = self._redis.get(key_reserved)

                # can not launch multiple tasks on service with no multi-tasking (ec2)
                if not service.resource_multitask and \
                   not task_reserved and \
                   (gpu_tasks or cpu_tasks):
                    continue

                for k, v in six.iteritems(gpu_tasks):
                    if v in preallocated_task_count:
                        preallocated_task_count[v].incr_ngpus(1)
                    else:
                        preallocated_task_count[v] = Capacity(ngpus=1)
                        preallocated_task_resource[v] = resource
                    current_xpu_usage.incr_ngpus(1)
                for k, v in six.iteritems(cpu_tasks):
                    if v in preallocated_task_count:
                        preallocated_task_count[v].incr_ncpus(1)
                    else:
                        preallocated_task_count[v] = Capacity(ncpus=1)
                        preallocated_task_resource[v] = resource
                    current_xpu_usage.incr_ncpus(1)
                available_xpus = capacity - current_xpu_usage
                avail_resource[resource] = available_xpus
                reserved[resource] = task_reserved
                self._logger.debug("\tresource %s - reserved: %s - free %s",
                                   resource, task_reserved or "False",
                                   available_xpus)

            if len(avail_resource) == 0:
                return

            # Go through the tasks, find if there are tasks that can be launched and
            # queue the best one
            best_task_id = None
            best_task_priority = -10000
            best_task_queued_time = 0
            while count > 0:
                count -= 1
                next_task_id = self._redis.lindex(queue, count)

                if next_task_id is not None:
                    next_keyt = 'task:%s' % next_task_id
                    # self._logger.debug("\tcheck task: %s", next_task_id)
                    parent = self._redis.hget(next_keyt, 'parent')
                    # check parent dependency
                    if parent:
                        keyp = 'task:%s' % parent
                        if self._redis.exists(keyp):
                            # if the parent task is in the database, check for dependencies
                            parent_status = self._redis.hget(keyp, 'status')
                            if parent_status != 'stopped':
                                if parent_status == 'running':
                                    # parent is still running so update queued time to be as close
                                    # as possible to terminate time of parent task
                                    self._redis.hset(next_keyt, "queued_time",
                                                     time.time())
                                continue
                            else:
                                if self._redis.hget(keyp,
                                                    'message') != 'completed':
                                    task.terminate(self._redis,
                                                   next_task_id,
                                                   phase='dependency_error')
                                    continue

                    nxpus = Capacity(self._redis.hget(next_keyt, 'ngpus'),
                                     self._redis.hget(next_keyt, 'ncpus'))

                    foundResource = False
                    if next_task_id in preallocated_task_count:
                        # if task is pre-allocated, can only continue on the same node
                        r = preallocated_task_resource[next_task_id]
                        nxpus -= preallocated_task_count[next_task_id]
                        avail_r = avail_resource[r]
                        foundResource = (nxpus.ngpus == 0 and avail_r.ncpus !=
                                         0) or (nxpus.ngpus != 0
                                                and avail_r.ngpus != 0)
                    else:
                        # can the task be launched on any node
                        for r, v in six.iteritems(avail_resource):
                            # cannot launch a new task on a reserved node
                            if reserved[r]:
                                continue
                            if ((nxpus.ngpus > 0
                                 and resources[r].ngpus >= nxpus.ngpus
                                 and v.ngpus > 0)
                                    or (nxpus.ngpus == 0 and v.ncpus >= 0)):
                                foundResource = True
                                break
                    if not foundResource:
                        continue

                    priority = int(self._redis.hget(next_keyt, 'priority'))
                    queued_time = float(
                        self._redis.hget(next_keyt, 'queued_time'))
                    if priority > best_task_priority or (
                            priority == best_task_priority
                            and best_task_queued_time > queued_time):
                        best_task_priority = priority
                        best_task_id = next_task_id
                        best_task_queued_time = queued_time

            if best_task_id:
                self._logger.info('selected %s to be launched on %s',
                                  best_task_id, service.name)
                task.work_queue(self._redis, best_task_id, service.name)
                self._redis.lrem(queue, 0, best_task_id)
Beispiel #13
0
    def run(self):
        """Run the worker main loop; it is only left through ``sys.exit``.

        Every iteration sleeps 0.01s and performs, in this order:

        * refresh of the worker key (every 1000 iterations), exiting when the
          key no longer exists;
        * administration commands and default configuration check (every 100
          iterations);
        * handling of at most one keyspace notification;
        * advancing at most one task taken from the work queue;
        * a search for a launchable queued task, once more than
          ``self._refresh_counter`` iterations have elapsed.
        """
        self._logger.info('Starting worker')

        # Channel prefixes of the keyspace notifications; the task id is
        # whatever follows the prefix.
        beat_prefix = '__keyspace@0__:beat:'
        queue_prefix = '__keyspace@0__:queue:'

        # Subscribe to beat expiration.
        pubsub = self._redis.pubsub()
        pubsub.psubscribe('__keyspace@0__:beat:*')
        pubsub.psubscribe('__keyspace@0__:queue:*')

        refresh_ticks = 0
        # starts at the threshold so that the first iteration refreshes the beat
        beat_ticks = 1000

        while True:
            beat_ticks += 1

            # every 1000 * 0.01s (10s) - check & reset beat of the worker
            if beat_ticks > 1000:
                beat_ticks = 0
                if not self._redis.exists(self._worker_id):
                    self._logger.info('stopped by key expiration/removal')
                    sys.exit(0)
                self._redis.hset(self._worker_id, "beat_time", time.time())
                self._redis.expire(self._worker_id, 1200)

            # every 100 * 0.01s (1s) - check worker administration command
            if beat_ticks % 100 == 0:
                workeradmin.process(self._logger, self._redis, self._service)
                if self._default_config_timestamp:
                    current_timestamp = self._redis.hget(
                        'default', 'timestamp')
                    if current_timestamp != self._default_config_timestamp:
                        self._logger.info(
                            'stopped by default configuration change')
                        sys.exit(0)

            # process one message from the queue
            message = pubsub.get_message()
            if message:
                channel = message['channel']
                if message['data'] == 'expired':
                    if channel.startswith(beat_prefix):
                        # task expired, no beat was received
                        expired_id = channel[len(beat_prefix):]
                        task_service = self._redis.hget(
                            'task:' + expired_id, 'service')
                        if task_service in self._services:
                            self._logger.info('%s: task expired', expired_id)
                            with self._redis.acquire_lock(expired_id):
                                task.terminate(self._redis,
                                               expired_id,
                                               phase='expired')
                    elif channel.startswith(queue_prefix):
                        # expired in the queue - comes back in the work queue
                        delayed_id = channel[len(queue_prefix):]
                        task_service = self._redis.hget(
                            'task:' + delayed_id, 'service')
                        if task_service in self._services:
                            self._logger.info('%s: move to work queue',
                                              delayed_id)
                            task.work_queue(self._redis, delayed_id,
                                            task_service)

            # process one element from work queue
            task_id = task.work_unqueue(self._redis, self._service)
            if task_id is not None:
                try:
                    self._advance_task(task_id)
                except RuntimeWarning:
                    # the task lock could not be taken: put the task back
                    self._logger.warning(
                        '%s: failed to acquire a lock, retrying', task_id)
                    task.work_queue(self._redis, task_id, self._service)
                except Exception as e:
                    # any other failure is recorded in the task log and
                    # terminates the task
                    error_message = str(e)
                    self._logger.error('%s: %s', task_id, error_message)
                    with self._redis.acquire_lock(task_id):
                        task.set_log(self._redis, self._taskfile_dir, task_id,
                                     error_message)
                        task.terminate(self._redis,
                                       task_id,
                                       phase="launch_error")

            # every 0.01s * refresh_counter - check if we can find some free resource
            if refresh_ticks > self._refresh_counter:
                refresh_ticks = 0
                # if there are some queued tasks, look for free resources
                if self._redis.exists('queued:%s' % self._service):
                    self._logger.debug('checking processes on : %s',
                                       self._service)
                    self._service_unqueue(self._services[self._service])

            refresh_ticks += 1
            time.sleep(0.01)
Beispiel #14
0
    def _service_unqueue(self, service):
        """Find the best next task to push to the work queue.

        While holding the ``service:<name>`` lock, the method:

        1. computes, for every resource of ``service``, the free CPUs (read
           from the ``ncpus:...`` key) and free GPUs (capacity minus the
           entries of the ``gpu_resource:...`` hash), remembering which tasks
           already hold GPUs and on which resource;
        2. scans the ``queued:<name>`` list from the last element to the
           first, skipping tasks whose parent is not stopped, terminating
           tasks whose stopped parent did not complete, and skipping tasks
           for which no resource qualifies;
        3. keeps the candidate with the highest priority, preferring the
           smallest ``queued_time`` when priorities are equal;
        4. pushes the selected task to the work queue and removes it from
           the queued list.

        Args:
            service: service object; ``name`` and ``list_resources()`` are
                used here.

        Returns:
            None.
        """
        with self._redis.acquire_lock('service:' + service.name):
            queue = 'queued:%s' % service.name
            count = self._redis.llen(queue)

            preallocated_task_count = {}
            preallocated_task_resource = {}
            # resource -> (free cpus, free gpus)
            avail_resource = {}
            resources = service.list_resources()
            reserved = {}

            # list free cpu/gpus on each node
            for resource in resources:
                keyr = 'gpu_resource:%s:%s' % (self._service, resource)
                keyc = 'ncpus:%s:%s' % (self._service, resource)
                available_cpus = int(self._redis.get(keyc))
                current_gpu_usage = 0
                gpu_capacity = resources[resource]
                # every entry of the hash is one GPU held by the task stored
                # as its value
                for _, holder in six.iteritems(self._redis.hgetall(keyr)):
                    if holder in preallocated_task_count:
                        preallocated_task_count[holder] += 1
                    else:
                        preallocated_task_count[holder] = 1
                        preallocated_task_resource[holder] = resource
                    current_gpu_usage += 1
                available_gpus = gpu_capacity - current_gpu_usage
                avail_resource[resource] = (available_cpus, available_gpus)
                key_reserved = 'reserved:%s:%s' % (service.name, resource)
                reserved[resource] = self._redis.get(key_reserved)
                self._logger.debug(
                    "\tresource %s - reserved: %s - free gpus: %d, cpus: %d",
                    resource, reserved[resource] or "False", available_gpus,
                    available_cpus)

            # Go through the task, find if there are tasks that can be launched and
            # queue the best one
            best_task_id = None
            best_task_priority = -10000
            best_task_queued_time = 0
            for index in range(count - 1, -1, -1):
                next_task_id = self._redis.lindex(queue, index)
                if next_task_id is None:
                    continue

                next_keyt = 'task:%s' % next_task_id
                parent = self._redis.hget(next_keyt, 'parent')
                # check parent dependency
                if parent:
                    keyp = 'task:%s' % parent
                    # if the parent task is in the database, check for dependencies
                    if self._redis.exists(keyp):
                        parent_status = self._redis.hget(keyp, 'status')
                        if parent_status != 'stopped':
                            if parent_status == 'running':
                                # parent is still running so update queued time to be as close
                                # as possible to terminate time of parent task
                                self._redis.hset(next_keyt, "queued_time",
                                                 time.time())
                            continue
                        if self._redis.hget(keyp, 'message') != 'completed':
                            # the parent stopped without completing: the
                            # child cannot run
                            task.terminate(self._redis,
                                           next_task_id,
                                           phase='dependency_error')
                            continue

                ngpus = int(self._redis.hget(next_keyt, 'ngpus'))
                ncpus = int(self._redis.hget(next_keyt, 'ncpus'))

                found_resource = False
                if next_task_id in preallocated_task_count:
                    # if task is pre-allocated, can only continue on the same node
                    r = preallocated_task_resource[next_task_id]
                    ngpus -= preallocated_task_count[next_task_id]
                    free_cpus, free_gpus = avail_resource[r]
                    if ngpus != 0:
                        # GPUs are still missing: at least one must be free
                        found_resource = free_gpus != 0
                    else:
                        # all GPUs are held: the CPU request must fit in the
                        # free CPUs (it used to be compared with the free
                        # GPU count)
                        found_resource = ncpus <= free_cpus
                else:
                    # can the task be launched on any node
                    for r, (free_cpus, free_gpus) in six.iteritems(
                            avail_resource):
                        # cannot launch a new task on a reserved node
                        if reserved[r]:
                            continue
                        if ((ngpus > 0 and resources[r] >= ngpus
                             and free_gpus > 0)
                                or (ngpus == 0 and free_cpus >= ncpus)):
                            found_resource = True
                            break
                if not found_resource:
                    continue

                priority = int(self._redis.hget(next_keyt, 'priority'))
                queued_time = float(
                    self._redis.hget(next_keyt, 'queued_time'))
                if priority > best_task_priority or (
                        priority == best_task_priority
                        and best_task_queued_time > queued_time):
                    best_task_priority = priority
                    best_task_id = next_task_id
                    best_task_queued_time = queued_time

            if best_task_id:
                self._logger.info('selected %s to be launched on %s',
                                  best_task_id, service.name)
                task.work_queue(self._redis, best_task_id, service.name)
                self._redis.lrem(queue, 0, best_task_id)
Beispiel #15
0
    def _advance_task(self, task_id):
        """Tries to advance the task to the next status. If it can, re-queue it immediately
        to process the next stage. Otherwise, re-queue it after some delay to try again.

        The task lock is held for the whole transition. Depending on the
        current status:

        * ``stopped``: nothing is done;
        * ``queued``: a resource is requested; on success the task becomes
          ``allocated`` and is re-queued, otherwise it waits for a resource;
        * ``allocated``: the task is launched on its service, the returned
          job data is stored and the task becomes ``running``;
        * ``running``: the service is polled; a ``dead`` job is terminated,
          otherwise the task is re-queued with a delay;
        * ``terminating``: the job (if any) is terminated on the service, the
          resource is released and the task becomes ``stopped``.

        Args:
            task_id: identifier of the task, stored under ``task:<task_id>``.

        Raises:
            ValueError: if the task refers to a service this worker does not
                know.
        """
        keyt = 'task:%s' % task_id
        with self._redis.acquire_lock(keyt, acquire_timeout=1,
                                      expire_time=600):
            status = self._redis.hget(keyt, 'status')
            if status == 'stopped':
                return

            service_name = self._redis.hget(keyt, 'service')
            if service_name not in self._services:
                raise ValueError('unknown service %s' % service_name)
            service = self._services[service_name]

            self._logger.info('%s: trying to advance from status %s', task_id,
                              status)

            if status == 'queued':
                resource = self._redis.hget(keyt, 'resource')
                resource = self._allocate_resource(task_id, resource, service)
                if resource is not None:
                    self._logger.info('%s: resource %s reserved', task_id,
                                      resource)
                    self._redis.hset(keyt, 'resource', resource)
                    task.set_status(self._redis, keyt, 'allocated')
                    task.queue(self._redis, task_id)
                else:
                    self._logger.warning('%s: no resources available, waiting',
                                         task_id)
                    self._wait_for_resource(service, task_id)

            elif status == 'allocated':
                content = json.loads(self._redis.hget(keyt, 'content'))
                resource = self._redis.hget(keyt, 'resource')
                docker = content['docker']
                self._logger.info('%s: launching on %s', task_id, service.name)
                data = service.launch(task_id, content['options'], resource,
                                      docker['registry'],
                                      docker['image'],
                                      docker['tag'],
                                      docker['command'],
                                      task.file_list(self._redis, task_id),
                                      content['wait_after_launch'])
                self._logger.info('%s: task started on %s', task_id,
                                  service.name)
                self._redis.hset(keyt, 'job', json.dumps(data))
                task.set_status(self._redis, keyt, 'running')
                # For services that do not notify their activity, we should
                # poll the task status more regularly.
                task.queue(self._redis,
                           task_id,
                           delay=120 if service.is_notifying_activity else 30)

            elif status == 'running':
                data = json.loads(self._redis.hget(keyt, 'job'))
                # kept separate from the task status read above
                job_status = service.status(data)
                if job_status == 'dead':
                    self._logger.info(
                        '%s: task no longer running on %s, request termination',
                        task_id, service.name)
                    task.terminate(self._redis, task_id, phase='exited')
                else:
                    task.queue(self._redis,
                               task_id,
                               delay=120
                               if service.is_notifying_activity else 30)

            elif status == 'terminating':
                data = self._redis.hget(keyt, 'job')
                if data is not None:
                    data = json.loads(data)
                    self._logger.info('%s: terminating task', task_id)
                    try:
                        service.terminate(data)
                        self._logger.info('%s: terminated', task_id)
                    except Exception as e:
                        # best effort: the task is still released and marked
                        # stopped below, but the cause is kept in the log
                        self._logger.warning('%s: failed to terminate - %s',
                                             task_id, str(e))
                resource = self._redis.hget(keyt, 'resource')
                self._release_resource(service, resource, task_id)
                task.set_status(self._redis, keyt, 'stopped')
                task.disable(self._redis, task_id)