async def api_impl(container_id):
     '''
     Fetch the accumulated CPU usage of a container via the Docker stats API.

     A ``DockerContainer`` handle is created from ``ctx.agent.docker`` and
     handed to ``fetch_api_stats``.

     :param container_id: ID of the container to query.
     :return: ``cpu_stats.cpu_usage.total_usage`` (looked up with 0 as the
         default) divided by 1e6, or ``None`` when ``fetch_api_stats``
         yields ``None``.
     '''
     handle = DockerContainer(ctx.agent.docker, id=container_id)
     stats = await fetch_api_stats(handle)
     if stats is None:
         return None
     # NOTE(review): Docker presumably reports total_usage in nanoseconds,
     # which would make the result milliseconds -- confirm with the consumer.
     total_usage = nmget(stats, 'cpu_stats.cpu_usage.total_usage', 0)
     return total_usage / 1e6
 async def api_impl(container_id):
     '''
     Gather memory, block-I/O and scratch-size figures for a container.

     :param container_id: ID of the container to query.
     :return: ``None`` when ``fetch_api_stats`` yields ``None``; otherwise a
         4-tuple ``(mem_cur_bytes, io_read_bytes, io_write_bytes,
         scratch_sz)``.
     '''
     handle = DockerContainer(ctx.agent.docker, id=container_id)
     stats = await fetch_api_stats(handle)
     if stats is None:
         return None
     mem_cur_bytes = nmget(stats, 'memory_stats.usage', 0)
     # Sum the reported byte counts per direction; entries whose 'op' is
     # neither 'Read' nor 'Write' are ignored.
     io_bytes = {'Read': 0, 'Write': 0}
     blkio_entries = nmget(stats, 'blkio_stats.io_service_bytes_recursive', [])
     for entry in blkio_entries:
         op = entry['op']
         if op in io_bytes:
             io_bytes[op] += entry['value']
     # get_scratch_size is called through run_in_executor rather than
     # directly (presumably because it blocks on disk access -- confirm).
     scratch_sz = await current_loop().run_in_executor(
         None, get_scratch_size, container_id)
     return mem_cur_bytes, io_bytes['Read'], io_bytes['Write'], scratch_sz
Beispiel #3
0
async def _collect_stats_api(container):
    '''
    Take a one-shot stats snapshot of ``container`` through the Docker API
    and condense it into a ``ContainerStat``.

    :param container: object providing an awaitable ``stats(stream=False)``
        and an ``_id`` attribute (its first 7 characters appear in the
        warning message).
    :return: a ``ContainerStat``, or ``None`` when the API call raises
        ``DockerError`` / ``aiohttp.ClientResponseError``, when it returns
        ``None``, or when the snapshot's ``preread`` value starts with
        ``0001-01-01``.
    '''
    try:
        snapshot = await container.stats(stream=False)
    except (DockerError, aiohttp.ClientResponseError):
        log.warning(
            f'cannot read stats: Docker stats API error for {container._id[:7]}.')
        return None

    # Even a successful API call may hand back nothing usable.
    # NOTE(review): a 'preread' of 0001-01-01 is presumably Docker's zero
    # timestamp, i.e. no real sample was taken -- confirm.
    if snapshot is None or snapshot['preread'].startswith('0001-01-01'):
        return None

    # NOTE(review): the division by 1e6 presumably converts nanoseconds to
    # milliseconds -- confirm against the Docker stats API.
    cpu_used = nmget(snapshot, 'cpu_stats.cpu_usage.total_usage', 0) / 1e6
    mem_max_bytes = nmget(snapshot, 'memory_stats.max_usage', 0)
    mem_cur_bytes = nmget(snapshot, 'memory_stats.usage', 0)

    # Total block-I/O bytes per direction; other operation kinds are skipped.
    io_read_bytes = io_write_bytes = 0
    for entry in nmget(snapshot, 'blkio_stats.io_service_bytes_recursive', []):
        op = entry['op']
        if op == 'Read':
            io_read_bytes += entry['value']
        elif op == 'Write':
            io_write_bytes += entry['value']

    # Network traffic summed over every entry under 'networks'.
    net_rx_bytes = net_tx_bytes = 0
    for iface in nmget(snapshot, 'networks', {}).values():
        net_rx_bytes += iface['rx_bytes']
        net_tx_bytes += iface['tx_bytes']

    return ContainerStat(
        cpu_used,
        mem_max_bytes,
        mem_cur_bytes,
        net_rx_bytes,
        net_tx_bytes,
        io_read_bytes,
        io_write_bytes,
        0,  # io_max_scratch_size: not measured by this collector
        0,  # io_cur_scratch_size: not measured by this collector
    )
Beispiel #4
0
    async def __ainit__(self) -> None:
        '''
        Asynchronous initializer: bring up the agent and its RPC endpoint.

        The steps run strictly in order: mark the status as ``'starting'``,
        optionally detect the manager, read the agent configuration,
        initialize the stats/error plugin contexts, instantiate the agent
        class of the configured backend, create the RPC peer and register
        the RPC functions, publish the node's address (and watcher port, if
        configured) to etcd, and finally mark the status as ``'running'``.
        '''
        # Start serving requests.
        await self.update_status('starting')

        # Manager detection can be switched off via skip_detect_manager.
        if not self.skip_detect_manager:
            await self.detect_manager()

        await self.read_agent_config()
        await self.read_agent_config_container()

        # Both plugin contexts are created first, then initialized in turn;
        # they are handed to the agent below.
        self.stats_monitor = StatsPluginContext(self.etcd, self.local_config)
        self.error_monitor = ErrorPluginContext(self.etcd, self.local_config)
        await self.stats_monitor.init()
        await self.error_monitor.init()

        # The agent implementation is resolved dynamically from the configured
        # backend: module ``ai.backend.agent.<backend.value>`` must expose
        # ``get_agent_cls()``.
        backend = self.local_config['agent']['backend']
        agent_mod = importlib.import_module(f"ai.backend.agent.{backend.value}")
        self.agent = await agent_mod.get_agent_cls().new(  # type: ignore
            self.etcd,
            self.local_config,
            stats_monitor=self.stats_monitor,
            error_monitor=self.error_monitor,
        )

        # RPC peer bound to the configured listen address over ZeroMQ, with
        # msgpack used for (de)serialization.
        rpc_addr = self.local_config['agent']['rpc-listen-addr']
        self.rpc_server = Peer(
            bind=ZeroMQAddress(f"tcp://{rpc_addr}"),
            transport=ZeroMQRPCTransport,
            scheduler=ExitOrderedAsyncScheduler(),
            serializer=msgpack.packb,
            deserializer=msgpack.unpackb,
            debug_rpc=self.local_config['debug']['enabled'],
        )
        # Each registered RPC function name is served by the attribute of
        # the same name on this object.
        for func_name in self.rpc_function.functions:
            self.rpc_server.handle_function(func_name, getattr(self, func_name))
        # NOTE(review): the '{}' placeholder assumes ``log`` is a brace-style
        # logging adapter -- confirm.
        log.info('started handling RPC requests at {}', rpc_addr)

        # Publish this node's RPC host (and the watcher port when one is
        # configured) to etcd under the NODE scope.
        await self.etcd.put('ip', rpc_addr.host, scope=ConfigScopes.NODE)
        watcher_port = utils.nmget(self.local_config, 'watcher.service-addr.port', None)
        if watcher_port is not None:
            await self.etcd.put('watcher_port', watcher_port, scope=ConfigScopes.NODE)

        await self.update_status('running')
Beispiel #5
0
async def get_time_binned_monthly_stats(request, user_uuid=None):
    '''
    Generate time-binned (15 min) stats for the last one month (2880 points).

    Kernels whose ``terminated_at`` lies within the last 30 days and whose
    status is in ``RESOURCE_USAGE_KERNEL_STATUSES`` are read from the DB in
    ascending ``created_at`` order and folded into consecutive 15-minute
    windows.  Each kernel is consumed at most once: it is added to the first
    window it overlaps and is not repeated in later windows it may span.

    The structure of the result would be:

        [
            {
                "date": 1562083808.657106,
                "num_sessions": {"value": 1, "unit_hint": "count"},
                "cpu_allocated": {"value": 1.2, "unit_hint": "count"},
                "mem_allocated": {"value": 1073741824.0, "unit_hint": "bytes"},
                "gpu_allocated": {"value": 0, "unit_hint": "count"},
                "io_read_bytes": {"value": 0, "unit_hint": "bytes"},
                "io_write_bytes": {"value": 0, "unit_hint": "bytes"},
                "disk_used": {"value": 0, "unit_hint": "bytes"},
            },
            ...
        ]

    Note that "date" is the start of the window as a UNIX timestamp.

    :param request: request object whose ``app['dbpool']`` provides the DB
        connections.
    :param user_uuid: when not ``None``, only kernels of this user are
        taken into account.
    :return: list of per-window stat dicts in chronological order.
    '''
    # Get all or user kernels for the last month from DB.
    time_window = 900  # 15 min
    now = datetime.now(tzutc())
    start_date = now - timedelta(days=30)
    async with request.app['dbpool'].acquire() as conn, conn.begin():
        query = (
            sa.select([kernels])
            .select_from(kernels)
            .where(
                (kernels.c.terminated_at >= start_date)
                & (kernels.c.status.in_(RESOURCE_USAGE_KERNEL_STATUSES))
            )
            .order_by(sa.asc(kernels.c.created_at))
        )
        if user_uuid is not None:
            query = query.where(kernels.c.user_uuid == user_uuid)
        result = await conn.execute(query)
        rows = await result.fetchall()

    # Build time-series of time-binned stats.
    # Count the rows actually fetched instead of relying on the driver's
    # rowcount for a SELECT statement.
    rowcount = len(rows)
    end_ts = now.timestamp()
    ts = start_date.timestamp()
    idx = 0
    tseries = []
    # Iterate over each time window.
    while ts < end_ts:
        # Initialize the time-binned stats.
        num_sessions = 0
        cpu_allocated = 0
        mem_allocated = 0
        gpu_allocated = 0
        io_read_bytes = 0
        io_write_bytes = 0
        disk_used = 0
        # Consume rows (in created_at order) for as long as the next one
        # overlaps this window; the first non-overlapping row stops the scan
        # and is retried against the following window.
        while idx < rowcount:
            row = rows[idx]
            overlaps = (ts + time_window > row.created_at.timestamp()
                        and ts < row.terminated_at.timestamp())
            if not overlaps:
                break
            num_sessions += 1
            cpu_allocated += float(row.occupied_slots.get('cpu', 0))
            mem_allocated += float(row.occupied_slots.get('mem', 0))
            if 'cuda.devices' in row.occupied_slots:
                gpu_allocated += float(row.occupied_slots['cuda.devices'])
            if 'cuda.shares' in row.occupied_slots:
                gpu_allocated += float(row.occupied_slots['cuda.shares'])
            if row.last_stat:
                io_read_bytes += int(
                    nmget(row.last_stat, 'io_read.current', 0))
                io_write_bytes += int(
                    nmget(row.last_stat, 'io_write.current', 0))
                # The default must be numeric: int() cannot convert a dict,
                # so a missing scratch-size stat has to fall back to 0.
                disk_used += int(
                    nmget(row.last_stat, 'io_scratch_size/stats.max', 0, '/'))
            idx += 1
        stat = {
            "date": ts,
            "num_sessions": {
                "value": num_sessions,
                "unit_hint": "count",
            },
            "cpu_allocated": {
                "value": cpu_allocated,
                "unit_hint": "count",
            },
            "mem_allocated": {
                "value": mem_allocated,
                "unit_hint": "bytes",
            },
            "gpu_allocated": {
                "value": gpu_allocated,
                "unit_hint": "count",
            },
            "io_read_bytes": {
                "value": io_read_bytes,
                "unit_hint": "bytes",
            },
            "io_write_bytes": {
                "value": io_write_bytes,
                "unit_hint": "bytes",
            },
            "disk_used": {
                # Same "value" key as every other metric (no trailing space).
                "value": disk_used,
                "unit_hint": "bytes",
            },
        }
        tseries.append(stat)
        ts += time_window
    return tseries
Beispiel #6
0
async def get_container_stats_for_period(request,
                                         start_date,
                                         end_date,
                                         group_ids=None):
    '''
    Collect per-container resource usage for a period, bundled by group.

    Selected kernels are either (a) in ``RESOURCE_USAGE_KERNEL_STATUSES``,
    terminated at or after ``start_date`` and created before ``end_date``,
    or (b) in ``LIVE_STATUS`` and created before ``end_date``.  Rows are
    joined with their group and user and ordered by ``terminated_at``.

    :param request: request object providing ``app['dbpool']`` and
        ``app['config']['system']['timezone']``.
    :param start_date: start of the requested period.
    :param end_date: end of the requested period.
    :param group_ids: when truthy, restricts the result to these group IDs.
    :return: list with one dict per group, holding the group's summed
        ``g_*`` figures and the individual container dicts under
        ``'c_infos'``.
    '''
    async with request.app['dbpool'].acquire() as conn, conn.begin():
        joined = kernels.join(groups, groups.c.id == kernels.c.group_id)
        joined = joined.join(users, users.c.uuid == kernels.c.user_uuid)
        # Sessions whose existence period overlaps with the requested period.
        overlaps_period = (
            (kernels.c.terminated_at >= start_date)
            & (kernels.c.created_at < end_date)
            & (kernels.c.status.in_(RESOURCE_USAGE_KERNEL_STATUSES))
        )
        # Running sessions which were created before the requested end_date.
        still_live = (
            (kernels.c.created_at < end_date)
            & (kernels.c.status.in_(LIVE_STATUS))
        )
        query = sa.select([kernels, groups.c.name, users.c.email])
        query = query.select_from(joined)
        query = query.where(overlaps_period | still_live)
        query = query.order_by(sa.asc(kernels.c.terminated_at))
        if group_ids:
            query = query.where(kernels.c.group_id.in_(group_ids))
        result = await conn.execute(query)
        rows = await result.fetchall()

    # Container fields that are summed into the matching 'g_<field>' entry
    # of a group that already exists.
    summed_fields = (
        'cpu_allocated', 'cpu_used',
        'mem_allocated', 'mem_used',
        'shared_memory',
        'disk_allocated', 'disk_used',
        'io_read', 'io_write',
        'smp', 'gpu_mem_allocated', 'gpu_allocated',
    )
    objs_per_group = {}
    local_tz = request.app['config']['system']['timezone']

    for row in rows:
        group_id = str(row.group_id)
        last_stat = row.last_stat

        # Distinct second elements of the mount entries, if there are mounts.
        if row.mounts is None:
            nfs = None
        else:
            nfs = list({mount[1] for mount in row.mounts})

        # Usage duration is only known for kernels that have terminated.
        used_time = used_days = None
        if row['terminated_at'] is not None:
            used_time = str(row['terminated_at'] - row['created_at'])
            # Number of local-timezone calendar days touched, inclusive.
            used_days = (
                row['terminated_at'].astimezone(local_tz).toordinal()
                - row['created_at'].astimezone(local_tz).toordinal()
                + 1
            )

        # Aggregate the attached CUDA devices.
        device_type = set()
        smp = 0
        gpu_mem_allocated = 0
        cuda_devices = (row.attached_devices
                        and row.attached_devices.get('cuda'))
        if cuda_devices:
            for dev_info in cuda_devices:
                model_name = dev_info.get('model_name')
                if model_name:
                    device_type.add(model_name)
                smp += dev_info.get('smp', 0)
                gpu_mem_allocated += dev_info.get('mem', 0)

        # When both slot kinds are present, 'cuda.shares' wins over
        # 'cuda.devices' because it is checked last.
        gpu_allocated = 0
        for slot_name in ('cuda.devices', 'cuda.shares'):
            if slot_name in row.occupied_slots:
                gpu_allocated = row.occupied_slots[slot_name]

        c_info = {
            'id': str(row['id']),
            'container_id': row['container_id'],
            'domain_name': row['domain_name'],
            'group_id': str(row['group_id']),
            'group_name': row['name'],
            'name': row['sess_id'],
            'access_key': row['access_key'],
            'email': row['email'],
            'agent': row['agent'],
            'cpu_allocated': float(row.occupied_slots.get('cpu', 0)),
            'cpu_used': float(nmget(last_stat, 'cpu_used.current', 0)),
            'mem_allocated': int(row.occupied_slots.get('mem', 0)),
            'mem_used': int(nmget(last_stat, 'mem.capacity', 0)),
            'shared_memory': int(nmget(row.resource_opts, 'shmem', 0)),
            'disk_allocated': 0,  # TODO: disk quota limit
            'disk_used': int(
                nmget(last_stat, 'io_scratch_size/stats.max', 0, '/')),
            'io_read': int(nmget(last_stat, 'io_read.current', 0)),
            'io_write': int(nmget(last_stat, 'io_write.current', 0)),
            'used_time': used_time,
            'used_days': used_days,
            'device_type': list(device_type),
            'smp': float(smp),
            'gpu_mem_allocated': float(gpu_mem_allocated),
            'gpu_allocated': float(gpu_allocated),  # devices or shares
            'nfs': nfs,
            'image_id': row['image'],  # TODO: image id
            'image_name': row['image'],
            'created_at': str(row['created_at']),
            'terminated_at': str(row['terminated_at']),
            'status': row['status'].name,
            'status_changed': str(row['status_changed']),
        }

        group = objs_per_group.get(group_id)
        if group is None:
            # First container of this group: seed the totals with its values.
            objs_per_group[group_id] = {
                'domain_name': row['domain_name'],
                'g_id': group_id,
                'g_name': row['name'],  # this is group's name
                'g_cpu_allocated': c_info['cpu_allocated'],
                'g_cpu_used': c_info['cpu_used'],
                'g_mem_allocated': c_info['mem_allocated'],
                'g_mem_used': c_info['mem_used'],
                'g_shared_memory': c_info['shared_memory'],
                'g_disk_allocated': c_info['disk_allocated'],
                'g_disk_used': c_info['disk_used'],
                'g_io_read': c_info['io_read'],
                'g_io_write': c_info['io_write'],
                'g_device_type': copy.deepcopy(c_info['device_type']),
                'g_smp': c_info['smp'],
                'g_gpu_mem_allocated': c_info['gpu_mem_allocated'],
                'g_gpu_allocated': c_info['gpu_allocated'],
                'c_infos': [c_info],
            }
            continue

        # Known group: add this container's figures to the running totals.
        for field in summed_fields:
            group['g_' + field] += c_info[field]
        # Merge device models the group has not seen yet.
        for device in c_info['device_type']:
            if device not in group['g_device_type']:
                known_devices = group['g_device_type']
                known_devices.append(device)
                group['g_device_type'] = list(set(known_devices))
        group['c_infos'].append(c_info)

    return list(objs_per_group.values())