Ejemplo n.º 1
0
    def _update_node_state(self, host_conf: DockerHostConfig, node_stats: NodeStats, get_usage_stats: bool):
        """Refresh ``node_stats`` with the current state of one Docker host.

        The node is marked 'offline' (and nothing else is updated) when the
        Docker client cannot be created. If listing containers or querying the
        engine info fails afterwards, the node stays 'online' but the remaining
        fields are left untouched.

        :param host_conf: configuration of the Docker host to query
        :param node_stats: statistics object updated in place
        :param get_usage_stats: when True, also collect per-container memory and
            CPU usage (from KairosDB if enabled in the configuration, otherwise
            from the Docker stats API); when False usage is reported as 0
        """
        node_stats.labels = host_conf.labels
        try:
            my_engine = DockerClient(host_conf)
        except ZoeException as e:
            log.error(str(e))
            node_stats.status = 'offline'
            log.info('Node {} is offline'.format(host_conf.name))
            return
        else:
            node_stats.status = 'online'

        try:
            container_list = my_engine.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
            info = my_engine.info()
        except ZoeException:
            return

        node_stats.container_count = len(container_list)
        node_stats.cores_total = info['NCPU']
        node_stats.memory_total = info['MemTotal']
        if info['Labels'] is not None:
            # Build a new list instead of using `+=`: node_stats.labels is the
            # very same object as host_conf.labels at this point, so an in-place
            # extend would also grow the host configuration on every call.
            # NOTE(review): assumes host_conf.labels is a list (`+=` with a set
            # operand only works on a list) - confirm.
            node_stats.labels = list(host_conf.labels) + list(set(info['Labels']))

        # Containers whose soft limit equals the total node memory are skipped.
        node_stats.memory_reserved = sum(
            cont['memory_soft_limit'] for cont in container_list
            if cont['memory_soft_limit'] != node_stats.memory_total
        )
        node_stats.cores_reserved = sum(
            cont['cpu_quota'] / cont['cpu_period'] for cont in container_list
            if cont['cpu_period'] != 0
        )

        stats = {}
        for cont in container_list:
            period = cont['cpu_period']
            stats[cont['id']] = {
                # A zero period is reported as a limit of 0 rather than raising
                # ZeroDivisionError, matching the guard used for cores_reserved.
                'core_limit': cont['cpu_quota'] / period if period != 0 else 0,
                'mem_limit': cont['memory_soft_limit'],
            }
        # Same dict object: usage values added below are visible through
        # node_stats.service_stats as well.
        node_stats.service_stats = stats

        if not get_usage_stats:
            node_stats.memory_in_use = 0
            node_stats.cores_in_use = 0
            return

        if get_conf().kairosdb_enable:
            kdb = KairosDBInMetrics()
            for cont in container_list:
                stats[cont['id']].update(kdb.get_service_usage(cont['name']))
        else:
            for cont in container_list:
                try:
                    aux = my_engine.stats(cont['id'], stream=False)  # this call is very slow (>~1sec)
                    stats[cont['id']]['mem_usage'] = aux['memory_stats'].get('usage', 0) if 'usage' in aux['memory_stats'] else 0
                    stats[cont['id']]['cpu_usage'] = self._get_core_usage(aux)
                except ZoeException:
                    continue

        # Containers whose usage could not be retrieved have no usage keys;
        # count them as 0 instead of failing with KeyError.
        node_stats.memory_in_use = sum(stat.get('mem_usage', 0) for stat in stats.values())
        node_stats.cores_in_use = sum(stat.get('cpu_usage', 0) for stat in stats.values())
Ejemplo n.º 2
0
 def update_service(self, service, cores=None, memory=None):
     """Update a service reservation.

     ``cores`` and ``memory`` are clamped to the totals reported by the Docker
     engine of the host running the service. Either value may be left as
     ``None`` to leave that reservation unchanged; when both are ``None``
     nothing is sent to the engine.

     Errors creating the Docker client and a missing backend ID are logged and
     the method returns without updating anything.

     :param service: the service to update; its ``backend_host`` and
         ``backend_id`` identify the container
     :param cores: new core reservation, or None
     :param memory: new memory reservation, or None
     """
     conf = self._get_config(service.backend_host)
     try:
         engine = DockerClient(conf)
     except ZoeException as e:
         log.error(str(e))
         return

     if service.backend_id is None:
         log.error('Cannot update reservations for service {} ({}), since it has no backend ID'.format(service.name, service.id))
         return

     if cores is None and memory is None:
         # Nothing to change; previously this path raised TypeError.
         return

     info = engine.info()

     cpu_quota = None
     if cores is not None:
         if cores > info['NCPU']:
             cores = info['NCPU']
         # presumably 100000 is the CPU period (in microseconds) the quota
         # is relative to - TODO confirm
         cpu_quota = int(cores * 100000)
     if memory is not None and memory > info['MemTotal']:
         memory = info['MemTotal']

     # NOTE(review): cpu_quota=None relies on the engine wrapper ignoring None
     # values, as it already must for mem_reservation=None - confirm.
     engine.update(service.backend_id, cpu_quota=cpu_quota, mem_reservation=memory)
Ejemplo n.º 3
0
    def _host_subthread(self, host_config: DockerHostConfig):
        """Periodically synchronise the state of one Docker host.

        Runs until ``self.stop`` is set. On every iteration it queries the
        Docker engine and refreshes ``self.host_stats[host_config.name]``:
        totals, labels, allocated and reserved resources, per-service limits
        and the list of images. Containers with no matching service are
        reported, and terminated if they are in the dead state. Each
        iteration is scheduled ``CHECK_INTERVAL`` seconds after the start of
        the previous one; a warning is logged when an iteration overruns.

        :param host_config: configuration of the Docker host to monitor
        """
        log.info("Synchro thread for host {} started".format(host_config.name))

        self.host_stats[host_config.name] = NodeStats(host_config.name)

        while True:
            time_start = time.time()
            node_stats = self.host_stats[host_config.name]
            try:
                my_engine = DockerClient(host_config)
                container_list = my_engine.list(
                    only_label={
                        'zoe_deployment_name': get_conf().deployment_name
                    })
                info = my_engine.info()
            except ZoeException as e:
                node_stats.status = 'offline'
                log.error(str(e))
                log.info('Node {} is offline'.format(host_config.name))
            else:
                if node_stats.status == 'offline':
                    log.info('Node {} is now online'.format(host_config.name))
                    node_stats.status = 'online'

                node_stats.container_count = info['Containers']
                node_stats.cores_total = info['NCPU']
                node_stats.memory_total = info['MemTotal']
                node_stats.labels = host_config.labels
                if info['Labels'] is not None:
                    # union() returns a new set; the result must be stored,
                    # otherwise the engine labels are silently dropped. The
                    # host configuration labels themselves are not modified.
                    node_stats.labels = host_config.labels.union(
                        set(info['Labels']))

                # Containers whose soft limit equals the total node memory
                # are skipped.
                node_stats.memory_allocated = sum(
                    cont['memory_soft_limit'] for cont in container_list
                    if cont['memory_soft_limit'] != info['MemTotal'])
                node_stats.cores_allocated = sum(
                    cont['cpu_quota'] / cont['cpu_period']
                    for cont in container_list if cont['cpu_period'] != 0)

                stats = {}
                node_stats.memory_reserved = 0
                node_stats.cores_reserved = 0
                for cont in container_list:
                    service = self.state.services.select(
                        only_one=True,
                        backend_host=host_config.name,
                        backend_id=cont['id'])
                    if service is None:
                        log.warning(
                            'Container {} on host {} has no corresponding service'
                            .format(cont['name'], host_config.name))
                        if cont['state'] == Service.BACKEND_DIE_STATUS:
                            log.warning(
                                'Terminating dead and orphan container {}'.
                                format(cont['name']))
                            my_engine.terminate_container(cont['id'],
                                                          delete=True)
                        continue
                    self._update_service_status(service, cont)
                    node_stats.memory_reserved += service.resource_reservation.memory.min
                    node_stats.cores_reserved += service.resource_reservation.cores.min
                    period = cont['cpu_period']
                    stats[service.id] = {
                        # A zero period is reported as a limit of 0 instead
                        # of raising ZeroDivisionError (which would kill the
                        # thread), matching the guard used for cores_allocated.
                        'core_limit': cont['cpu_quota'] / period if period != 0 else 0,
                        'mem_limit': cont['memory_soft_limit']
                    }
                node_stats.service_stats = stats

                node_stats.images = []
                for dk_image in my_engine.list_images():
                    image = {
                        'id': dk_image.attrs['Id'],
                        'size': dk_image.attrs['Size'],
                        'names': dk_image.tags  # type: list
                    }
                    for name in image['names']:
                        if name.endswith(':latest'):  # add an image with the name without 'latest' to fake Docker image lookup algorithm
                            image['names'].append(name[:-7])
                            break  # stop right after mutating the list being iterated
                    node_stats.images.append(image)

            sleep_time = CHECK_INTERVAL - (time.time() - time_start)
            if sleep_time <= 0:
                log.warning(
                    'synchro thread for host {} is late by {:.2f} seconds'.
                    format(host_config.name, -sleep_time))
                sleep_time = 0
            # wait() doubles as the sleep and the stop check
            if self.stop.wait(timeout=sleep_time):
                break

        log.info("Synchro thread for host {} stopped".format(host_config.name))