def get_container_diskio_usage(docker_id, start, end):
    """Fetch container disk I/O data

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'sum by (id, name) (rate(container_fs_reads_bytes_total{{id=~".*{ docker_id }.*"}}[1m]))'
    read = query_range(prom_query, start, end, step)
    if read.get('status') != 'success':
        return []

    prom_query = f'sum by (id, name) (rate(container_fs_writes_bytes_total{{id=~".*{ docker_id }.*"}}[1m]))'
    writes = query_range(prom_query, start, end, step)
    if writes.get('status') != 'success':
        return []

    if not read['data']['result'] or not writes['data']['result']:
        return []

    data = []
    metric = read['data']['result'][0]['metric']
    read_dict = dict(read['data']['result'][0]['values'])
    writes_dict = dict(writes['data']['result'][0]['values'])
    for k in read_dict:
        data.append({
            'time': k * 1000,
            'used_pct': 0,  # compatibility field
            'container_name': metric['name'],
            'read_bytes': normalize_metric(read_dict.get(k, 0)),  # in bytes
            'write_bytes': normalize_metric(writes_dict.get(k, 0))  # in bytes
        })
    return data
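# For reference, a minimal sketch of the `query_range` helper these functions
# rely on, assuming it is a thin wrapper over the Prometheus HTTP API's
# /api/v1/query_range endpoint. The PROMETHEUS_HOST value and the use of the
# `requests` library are assumptions for illustration, not the project's
# actual implementation.
import requests

PROMETHEUS_HOST = 'http://prometheus:9090'  # hypothetical address


def query_range(prom_query, start, end, step):
    """Run a PromQL range query; start/end are unix seconds, step is in seconds."""
    resp = requests.get(
        f'{PROMETHEUS_HOST}/api/v1/query_range',
        params={'query': prom_query, 'start': start, 'end': end, 'step': step})
    # the API answers with {"status": "success", "data": {"result": [...]}}
    return resp.json()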
def get_bk_data(self, data, cc_app_id):
    """Data returned by the data platform"""
    _data = apigw_data.get_node_metrics(
        data['metric'], cc_app_id, data['res_id'], data['start_at'], data['end_at'])
    metrics_data = []
    if data['metric'] == 'io':
        metric_list = groupby(
            sorted(_data['list'], key=lambda x: x['device_name']),
            key=lambda x: x['device_name'])
        for device_name, metrics in metric_list:
            metrics_data.append({
                'device_name': device_name,
                'metrics': [{
                    'rkb_s': normalize_metric(i['rkb_s']),
                    'wkb_s': normalize_metric(i['wkb_s']),
                    'time': i['time']
                } for i in metrics]
            })
        _data['list'] = metrics_data
    elif data['metric'] == 'cpu_summary':
        for i in _data['list']:
            i['usage'] = normalize_metric(i['usage'])
    elif data['metric'] == 'net':
        metric_list = groupby(
            sorted(_data['list'], key=lambda x: x['device_name']),
            key=lambda x: x['device_name'])
        for device_name, metrics in metric_list:
            metrics_data.append({
                'device_name': device_name,
                'metrics': [{
                    'speedSent': i['speedSent'],
                    'speedRecv': i['speedRecv'],
                    'time': i['time']
                } for i in metrics]
            })
        _data['list'] = metrics_data
    return _data
def get_container_network_usage(docker_id, start, end):
    """Fetch container network data

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'sum by (id, name) (rate(container_network_receive_bytes_total{{id=~".*{ docker_id }.*"}}[1m]))'
    receive = query_range(prom_query, start, end, step)
    if receive.get('status') != 'success':
        return []

    prom_query = f'sum by (id, name) (rate(container_network_transmit_bytes_total{{id=~".*{ docker_id }.*"}}[1m]))'
    transmit = query_range(prom_query, start, end, step)
    if transmit.get('status') != 'success':
        return []

    if not receive['data']['result'] or not transmit['data']['result']:
        return []

    data = []
    metric = receive['data']['result'][0]['metric']
    receive_dict = dict(receive['data']['result'][0]['values'])
    transmit_dict = dict(transmit['data']['result'][0]['values'])
    for k in receive_dict:
        data.append({
            'time': k * 1000,
            'container_name': metric['name'],
            'rxbytes': normalize_metric(receive_dict.get(k, 0)),
            'txbytes': normalize_metric(transmit_dict.get(k, 0))
        })
    return data
def get_bk_data(self, data, cc_app_id):
    _data = apigw_data.get_docker_metrics(
        data['metric'], cc_app_id, data['res_id'], data['start_at'], data['end_at'])
    metrics_data = []
    if data['metric'] == 'disk':
        metric_list = groupby(
            sorted(_data['list'], key=lambda x: x['device_name']),
            key=lambda x: x['device_name'])
        for device_name, metrics in metric_list:
            metrics_data.append({
                'device_name': device_name,
                'metrics': [{
                    'used_pct': normalize_metric(i['used_pct']),
                    'time': i['time']
                } for i in metrics]
            })
        _data['list'] = metrics_data
    elif data['metric'] == 'cpu_summary':
        for i in _data['list']:
            i['usage'] = normalize_metric(i.get('cpu_totalusage'))
            i.pop('cpu_totalusage', None)
    elif data['metric'] == 'mem':
        for i in _data['list']:
            i['rss_pct'] = normalize_metric(i['rss_pct'])
    elif data['metric'] == 'net':
        for i in _data['list']:
            i['rxpackets'] = int(i['rxpackets'])
            i['txbytes'] = int(i['txbytes'])
            i['rxbytes'] = int(i['rxbytes'])
            i['txpackets'] = int(i['txpackets'])
    return _data
def get_node_memory_usage(ip, start, end):
    """Fetch node memory usage

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'node_memory_MemTotal{{job="node-exporter", instance=~"{ ip }:9100"}}'
    total = query_range(prom_query, start, end, step)
    if total.get('status') != 'success':
        return []

    prom_query = f"""
        node_memory_MemTotal{{job="node-exporter", instance=~"{ ip }:9100"}} -
        node_memory_MemFree{{job="node-exporter", instance=~"{ ip }:9100"}} -
        node_memory_Buffers{{job="node-exporter", instance=~"{ ip }:9100"}} -
        node_memory_Cached{{job="node-exporter", instance=~"{ ip }:9100"}}"""
    usage = query_range(prom_query, start, end, step)
    if usage.get('status') != 'success':
        return []

    if not total['data']['result'] or not usage['data']['result']:
        return []

    data = []
    total_dict = dict(total['data']['result'][0]['values'])
    usage_dict = dict(usage['data']['result'][0]['values'])
    for k in total_dict:
        data.append({
            'time': k * 1000,
            'total': normalize_metric(total_dict.get(k, 0)),
            'used': normalize_metric(usage_dict.get(k, 0))
        })
    return data
def get_node_network_usage(ip, start, end):
    """Fetch node network data

    start and end are in milliseconds, consistent with the data platform
    Values are in KB/s
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'max by (instance) (rate(node_network_receive_bytes{{job="node-exporter", instance=~"{ ip }:9100"}}[5m]))'  # noqa
    receive = query_range(prom_query, start, end, step)
    if receive.get('status') != 'success':
        return []

    prom_query = f'max by (instance) (rate(node_network_transmit_bytes{{job="node-exporter", instance=~"{ ip }:9100"}}[5m]))'  # noqa
    transmit = query_range(prom_query, start, end, step)
    if transmit.get('status') != 'success':
        return []

    if not receive['data']['result'] or not transmit['data']['result']:
        return []

    data = []
    receive_dict = dict(receive['data']['result'][0]['values'])
    transmit_dict = dict(transmit['data']['result'][0]['values'])
    for k in receive_dict:
        data.append({
            'time': k * 1000,
            'speedRecv': normalize_metric(float(receive_dict.get(k, 0)) / 1024),
            'speedSent': normalize_metric(float(transmit_dict.get(k, 0)) / 1024)
        })
    return data
def get_node_diskio_usage(ip, start, end):
    """Fetch node disk I/O data

    start and end are in milliseconds, consistent with the data platform
    Values are in KB/s
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'max by (instance) (rate(node_disk_bytes_read{{job="node-exporter", instance=~"{ ip }:.*"}}[5m]))'
    read = query_range(prom_query, start, end, step)
    if read.get('status') != 'success':
        return []

    prom_query = f'max by (instance) (rate(node_disk_bytes_written{{job="node-exporter", instance=~"{ ip }:.*"}}[5m]))'
    written = query_range(prom_query, start, end, step)
    if written.get('status') != 'success':
        return []

    if not read['data']['result'] or not written['data']['result']:
        return []

    read_dict = dict(read['data']['result'][0]['values'])
    written_dict = dict(written['data']['result'][0]['values'])
    data = []
    for k in read_dict:
        data.append({
            'time': k * 1000,
            'rkb_s': normalize_metric(float(read_dict.get(k, 0)) / 1024),
            'wkb_s': normalize_metric(float(written_dict.get(k, 0)) / 1024)
        })
    return data
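# A minimal sketch of the `normalize_metric` helper used throughout, assuming
# it merely coerces raw metric values (Prometheus returns them as strings) to
# floats rounded to two decimal places; the real helper may do more.
def normalize_metric(value):
    try:
        return round(float(value), 2)
    except (TypeError, ValueError):
        return 0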
def cluster_info(self, request, project_id, cluster_id):
    # can view cluster
    self.can_view_cluster(request, project_id, cluster_id)
    cluster = self.get_cluster(request, project_id, cluster_id)
    cluster["cluster_name"] = cluster.get("name")
    cluster["created_at"] = normalize_datetime(cluster["created_at"])
    cluster["updated_at"] = normalize_datetime(cluster["updated_at"])
    status = cluster.get("status", "normal")
    cluster["chinese_status_name"] = ClusterStatusName[status].value
    # get area info
    area_info = self.get_area(request, cluster.get("area_id"))
    cluster["area_name"] = _(area_info.get("chinese_name"))
    # get master count
    cluster["master_count"] = self.get_master_count(request, project_id, cluster_id)
    # get node count
    cluster["node_count"] = self.get_node_count(request, project_id, cluster_id)
    if request.project.kind == app_constants.MESOS_KIND:
        # mesos reports memory in MB; convert to GB
        total_mem = normalize_metric(cluster["total_mem"] / 1024)
    else:
        total_mem = normalize_metric(cluster["total_mem"])
    cluster["total_mem"] = total_mem
    # get the cluster orchestration engine
    coes = cluster["type"]
    # supplement TKE and BCS K8S related config
    if coes == ClusterCOES.TKE.value:
        cluster.update(self.get_tke_cluster_config(request, project_id, cluster_id))
    elif coes == ClusterCOES.BCS_K8S.value:
        k8s_client = bcs.k8s.K8SClient(request.user.token.access_token, project_id, cluster_id, None)
        cluster["version"] = k8s_client.version
    return response.Response(cluster)
def get_multi_bk_data(self, data, cc_app_id):
    _data = apigw_data.get_docker_metrics(
        data['metric'], cc_app_id, data['res_id_list'], data['start_at'], data['end_at']
    )
    metrics_data = []
    if data['metric'] == 'cpu_summary':
        metric_list = groupby(sorted(_data['list'], key=lambda x: x['id']), key=lambda x: x['id'])
        for _id, metrics in metric_list:
            container_name = ''
            _metrics = []
            for i in metrics:
                container_name = i['container_name']
                _metrics.append(
                    {
                        'usage': normalize_metric(i.get('cpu_totalusage')),
                        'time': i['time'],
                    }
                )
            metrics_data.append({'id': _id, 'container_name': container_name, 'metrics': _metrics})
        _data['list'] = metrics_data
    elif data['metric'] == 'mem':
        metric_list = groupby(sorted(_data['list'], key=lambda x: x['id']), key=lambda x: x['id'])
        for _id, metrics in metric_list:
            container_name = ''
            _metrics = []
            for i in metrics:
                container_name = i['container_name']
                _metrics.append({'rss_pct': normalize_metric(i['rss_pct']), 'time': i['time']})
            metrics_data.append({'id': _id, 'container_name': container_name, 'metrics': _metrics})
        _data['list'] = metrics_data
    return _data
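# Note on the grouping pattern above: itertools.groupby only merges
# *consecutive* items with equal keys, which is why each call sorts by the
# same key first. A self-contained illustration with made-up rows:
from itertools import groupby

rows = [{'id': 'b', 'time': 1}, {'id': 'a', 'time': 2}, {'id': 'a', 'time': 3}]
for _id, metrics in groupby(sorted(rows, key=lambda x: x['id']),
                            key=lambda x: x['id']):
    print(_id, [m['time'] for m in metrics])
# prints: a [2, 3]
#         b [1]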
def compose_metric(self, request, cluster_data):
    metrics_cpu, metrics_mem, metrics_disk = [], [], []
    for i in cluster_data['results']:
        time = arrow.get(i['capacity_updated_at']).timestamp * 1000
        metrics_cpu.append({
            'time': time,
            'remain_cpu': num_transform(i['remain_cpu']),
            'total_cpu': i['total_cpu']
        })
        total_mem = normalize_metric(i['total_mem'])
        remain_mem = normalize_metric(num_transform(i['remain_mem']))
        metrics_mem.append({
            'time': time,
            'remain_mem': remain_mem,
            'total_mem': total_mem
        })
        # add disk metric
        metrics_disk.append({
            'time': time,
            'remain_disk': normalize_metric(num_transform(i['remain_disk']) / 1024),
            'total_disk': normalize_metric(i['total_disk'] / 1024),
        })
    return metrics_cpu, metrics_mem, metrics_disk
def get_bk_data(self, res_id, cc_app_id):
    """Data platform"""
    cpu_metrics = apigw_data.get_node_metrics('cpu_summary', cc_app_id, res_id, limit=1)
    if cpu_metrics['list']:
        cpu_metrics = normalize_metric(cpu_metrics['list'][0]['usage'])
    else:
        cpu_metrics = 0

    mem_metrics = apigw_data.get_node_metrics('mem', cc_app_id, res_id, limit=1)
    if mem_metrics['list']:
        mem_metrics = normalize_metric(mem_metrics['list'][0]['used'] * 100.0 / mem_metrics['list'][0]['total'])
    else:
        mem_metrics = 0

    # there can be many device_name values; these need handling
    io_metrics = apigw_data.get_node_metrics('io', cc_app_id, res_id, limit=1)
    if io_metrics['list']:
        io_metrics = normalize_metric(io_metrics['list'][0]['util'])
    else:
        io_metrics = 0

    data = {'cpu': cpu_metrics, 'mem': mem_metrics, 'io': io_metrics}
    return data
def cluster_info(self, request, project_id, cluster_id):
    # can view cluster
    self.can_view_cluster(request, project_id, cluster_id)
    cluster = self.get_cluster(request, project_id, cluster_id)
    cluster["cluster_name"] = cluster.get("name")
    cluster["created_at"] = normalize_datetime(cluster["created_at"])
    cluster["updated_at"] = normalize_datetime(cluster["updated_at"])
    status = cluster.get("status", "normal")
    cluster["chinese_status_name"] = ClusterStatusName[status].value
    # get area info
    area_info = self.get_area(request, cluster.get("area_id"))
    cluster["area_name"] = _(area_info.get("chinese_name"))
    # get master count
    cluster["master_count"] = self.get_master_count(
        request, project_id, cluster_id)
    # get node count
    cluster["node_count"] = self.get_node_count(request, project_id, cluster_id)
    if request.project.kind == app_constants.MESOS_KIND:
        # mesos reports memory in MB; convert to GB
        total_mem = normalize_metric(cluster["total_mem"] / 1024)
    else:
        total_mem = normalize_metric(cluster["total_mem"])
    cluster["total_mem"] = total_mem
    return response.Response(cluster)
def fixed_disk_usage_history(cluster_id):
    """k8s disk usage history, in GB"""
    end = arrow.now().timestamp * 1000
    start = end - 60 * 60 * 1000
    cluster_list = [cluster_id]
    total, free = get_cluster_disk_usage(cluster_list, start, end)
    total_dist = total.get(cluster_id) or []
    free_dist = free.get(cluster_id) or []
    total_dict = dict(total_dist)
    free_dict = dict(free_dist)
    data = []
    for k in total_dict:
        data.append({
            'time': k * 1000,
            'total_disk': normalize_metric(
                float(total_dict.get(k, 0)) / (1024 * 1024 * 1024)),
            'remain_disk': normalize_metric(
                float(free_dict.get(k, 0)) / (1024 * 1024 * 1024))
        })
    return data
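# Hedged usage sketch for fixed_disk_usage_history; the cluster id below is
# made up. Each point carries a millisecond timestamp plus total/remaining
# disk converted from bytes to GB.
for point in fixed_disk_usage_history('BCS-K8S-10001'):
    print(point['time'], point['total_disk'], point['remain_disk'])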
def cluster_info(self, request, project_id, cluster_id):
    # can view cluster
    self.can_view_cluster(request, project_id, cluster_id)
    cluster = self.get_cluster(request, project_id, cluster_id)
    cluster['cluster_name'] = cluster.get('name')
    cluster['created_at'] = normalize_datetime(cluster['created_at'])
    cluster['updated_at'] = normalize_datetime(cluster['updated_at'])
    status = cluster.get('status', 'normal')
    cluster['chinese_status_name'] = ClusterStatusName[status].value
    # get area info
    area_info = self.get_area(request, cluster.get('area_id'))
    cluster['area_name'] = _(area_info.get('chinese_name'))
    # get master count
    cluster['master_count'] = self.get_master_count(
        request, project_id, cluster_id)
    # get node count
    cluster['node_count'] = self.get_node_count(request, project_id, cluster_id)
    if request.project.kind == app_constants.MESOS_KIND:
        # mesos reports memory in MB; convert to GB
        total_mem = normalize_metric(cluster['total_mem'] / 1024)
    else:
        total_mem = normalize_metric(cluster['total_mem'])
    cluster['total_mem'] = total_mem
    return response.Response(cluster)
def cluster_info(self, request, project_id, cluster_id):
    # can view cluster
    self.can_view_cluster(request, project_id, cluster_id)
    cluster = self.get_cluster(request, project_id, cluster_id)
    cluster["cluster_name"] = cluster.get("name")
    cluster["created_at"] = normalize_datetime(cluster["created_at"])
    cluster["updated_at"] = normalize_datetime(cluster["updated_at"])
    status = cluster.get("status", "normal")
    cluster["chinese_status_name"] = ClusterStatusName[status].value
    # get area info
    area_info = self.get_area(request, cluster.get("area_id"))
    cluster["area_name"] = _(area_info.get("chinese_name"))
    # get master count
    cluster["master_count"] = self.get_master_count(request, project_id, cluster_id)
    # get node count
    cluster["node_count"] = self.get_node_count(request, project_id, cluster_id)
    total_mem = normalize_metric(cluster["total_mem"])
    cluster["total_mem"] = total_mem
    # get the cluster orchestration engine
    coes = cluster["type"]
    # supplement TKE and BCS K8S related config
    if coes == ClusterCOES.TKE.value:
        cluster.update(self.get_tke_cluster_config(request, project_id, cluster_id))
    cluster_version = self.query_cluster_version(request.user.token.access_token, project_id, cluster_id)
    # query the version from the cluster itself; if the query fails, fall back
    # to the version recorded in the cluster snapshot
    if cluster_version:
        cluster["version"] = cluster_version
    return response.Response(cluster)
def get_prom_data(self, res_id):
    """Prometheus data"""
    end_at = int(time.time()) * 1000
    start_at = end_at - 60 * 10 * 1000

    metric_data = prometheus.get_node_cpu_usage(res_id, start_at, end_at)
    if metric_data:
        cpu_metrics = metric_data[-1]['usage']
    else:
        cpu_metrics = 0

    metric_data = prometheus.get_node_disk_io_utils(
        res_id, start_at, end_at)
    if metric_data:
        io_metrics = metric_data[-1]['usage']
    else:
        io_metrics = 0

    metric_data = prometheus.get_node_memory_usage(res_id, start_at, end_at)
    if metric_data:
        mem_metrics = normalize_metric(metric_data[-1]['used'] * 100.0 / metric_data[-1]['total'])
    else:
        mem_metrics = 0

    data = {'cpu': cpu_metrics, 'mem': mem_metrics, 'io': io_metrics}
    return data
def compose_metric(self, request, cluster_data):
    metrics_cpu, metrics_mem, metrics_disk = [], [], []
    for i in cluster_data['results']:
        time = arrow.get(i['capacity_updated_at']).timestamp * 1000
        metrics_cpu.append({
            'time': time,
            'remain_cpu': num_transform(i['remain_cpu']),
            'total_cpu': i['total_cpu']
        })
        if request.project.kind == app_constants.ProjectKind.MESOS.value:
            # convert MB to GB
            total_mem = normalize_metric(i['total_mem'] / 1024)
            remain_mem = normalize_metric(
                num_transform(i['remain_mem']) / 1024)
        else:
            total_mem = normalize_metric(i['total_mem'])
            remain_mem = normalize_metric(num_transform(i['remain_mem']))
        metrics_mem.append({
            'time': time,
            'remain_mem': remain_mem,
            'total_mem': total_mem
        })
        # add disk metric
        metrics_disk.append({
            'time': time,
            'remain_disk': normalize_metric(num_transform(i['remain_disk']) / 1024),
            'total_disk': normalize_metric(i['total_disk'] / 1024)
        })
    return metrics_cpu, metrics_mem, metrics_disk
def fixed_disk_usage(cluster_data):
    """k8s disk usage, in GB"""
    end = arrow.now().timestamp * 1000
    start = end - 15 * 60 * 1000
    cluster_list = [i['cluster_id'] for i in cluster_data]
    total, free = get_cluster_disk_usage(cluster_list, start, end)
    for cluster in cluster_data:
        total_dist = total.get(cluster['cluster_id']) or []
        total_dist = total_dist[-1][1] if total_dist else 0
        total_disk = normalize_metric(float(total_dist) / (1024 * 1024 * 1024))

        free_dist = free.get(cluster['cluster_id']) or []
        free_dist = free_dist[-1][1] if free_dist else 0
        free_disk = normalize_metric(float(free_dist) / (1024 * 1024 * 1024))

        cluster['total_disk'] = total_disk
        cluster['remain_disk'] = free_disk
    return cluster_data
def get_node_disk_io_utils(ip, start, end):
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'max by (instance) (rate(node_disk_io_time_seconds_total{{job="node-exporter", instance=~"{ ip }:9100"}}[5m]) * 100)'  # noqa
    io_utils = query_range(prom_query, start, end, step)
    if io_utils.get('status') != 'success':
        return []
    if not io_utils['data']['result']:
        return []
    data = []
    for i in io_utils['data']['result'][0]['values']:
        data.append({'time': i[0] * 1000, 'usage': normalize_metric(i[1])})
    return data
def get_container_memory_usage(docker_id, start, end):
    """Fetch container memory usage

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    if isinstance(docker_id, list):
        docker_id_list = '|'.join('.*%s.*' % i for i in docker_id)
    else:
        docker_id_list = '.*%s.*' % docker_id
    prom_query = f'container_memory_usage_bytes{{id=~"{ docker_id_list }"}}'
    total = query_range(prom_query, start, end, step)
    if total.get('status') != 'success':
        return []
    if not total['data']['result']:
        return []

    data = []
    for res in total['data']['result']:
        _data = res['metric']
        _data['container_name'] = res['metric']['name']
        metrics = []
        for i in res['values']:
            metrics.append({
                'time': i[0] * 1000,
                'rss_pct': 0,
                'container_name': res['metric']['name'],
                'used': normalize_metric(float(i[1]) / 1024 / 1024),
                'unit': 'MB'
            })
        _data['metrics'] = metrics
        data.append(_data)
    # for a single id, return the metrics values directly
    if isinstance(docker_id, list):
        return data
    else:
        return data[0]['metrics']
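# Return-shape note for get_container_memory_usage (and get_container_cpu_usage
# below): passing a list of docker ids yields one entry per matched series,
# each with its own 'metrics' list; passing a single id string returns just
# that container's 'metrics'. Illustrative calls with made-up ids:
import time

now_ms = int(time.time()) * 1000
single = get_container_memory_usage('1234abcd', now_ms - 3600 * 1000, now_ms)
multi = get_container_memory_usage(['1234abcd', '5678efgh'],
                                   now_ms - 3600 * 1000, now_ms)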
def get_container_cpu_usage(docker_id, start, end):
    """Fetch total container CPU usage

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    if isinstance(docker_id, list):
        docker_id_list = '|'.join('.*%s.*' % i for i in docker_id)
    else:
        docker_id_list = '.*%s.*' % docker_id
    prom_query = f'sum by (id, name) (rate(container_cpu_usage_seconds_total{{id=~"{ docker_id_list }"}}[1m]))'
    resp = query_range(prom_query, start, end, step)
    if resp.get('status') != 'success':
        return []
    if not resp['data']['result']:
        return []

    data = []
    for res in resp['data']['result']:
        _data = res['metric']
        _data['container_name'] = res['metric']['name']
        metrics = []
        for i in res['values']:
            metrics.append({
                'time': i[0] * 1000,
                'container_name': res['metric']['name'],
                'usage': normalize_metric(float(i[1]) * 100)
            })
        _data['metrics'] = metrics
        data.append(_data)
    # for a single id, return the metrics values directly
    if isinstance(docker_id, list):
        return data
    else:
        return data[0]['metrics']
def get_node_cpu_usage(ip, start, end):
    """Fetch total node CPU usage

    start and end are in milliseconds, consistent with the data platform
    """
    start = start // 1000
    end = end // 1000
    step = (end - start) // 60
    prom_query = f'avg by (instance) (sum by (cpu,instance) (irate(node_cpu{{job="node-exporter", mode!="idle", instance=~"{ ip }:9100"}}[2m])))'  # noqa
    resp = query_range(prom_query, start, end, step)
    if resp.get('status') != 'success':
        return []
    if not resp['data']['result']:
        return []
    data = []
    for i in resp['data']['result'][0]['values']:
        data.append({
            'time': i[0] * 1000,
            'usage': normalize_metric(float(i[1]) * 100)
        })
    return data
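# End-to-end usage sketch for the node helpers: they all take millisecond
# timestamps (matching the data platform) and split the window into roughly
# 60 steps. The ip below is made up.
import time

end_ms = int(time.time()) * 1000
start_ms = end_ms - 60 * 60 * 1000  # last hour
for point in get_node_cpu_usage('10.0.0.1', start_ms, end_ms):
    print(point['time'], point['usage'])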