Beispiel #1
0
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """if key exists, the value will be replaced,
       add dynamic status
       {ai.centaurus.io/gpu0:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:1},
        ai.centaurus.io/gpu1:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:2, processes:[{pid:25678, cur_mem_used:3GB},{pid:67234, cur_mem_used:1GB}]}                                 
       }
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    # except connection error
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return ret_dict  # if connectioin fails, return empty dict
    instance = pod_ip + ":9400" # tmp fixed
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data to label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    ret_dict = dict()
    for item in metric_object_list: # iterate through all the gpus on the node
        if 'gpu' not in item.label_config: # handle metric config info exception
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        ts = item.metric_values.iloc[:, 1]  # metrics_values are two row df, 1st is timestamp, 2nd is value
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)       
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten nested dictionary to string, otherwise error "cannot unmarshal string into Go value of type map[string]interface {}""
        ret_dict[key] = str(cur_usage)
    return ret_dict
Beispiel #2
0
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    all DCGM metrics, on all instances, and all gpus
    save dumped data to csv file
    """
    # save the time first, in case multiple query at different time later
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # connect to premtheus server, exit if connection fails
    url = "http://prometheus:9090"  # use service name, instead of ip to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)
    # get all metrics under profiler job, note: some instances/gpus may not have all the metrics due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # get a screenshot of all the instances (pod ip)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add instance in query
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # put each instance's gpus into dictionary
        ins_gpu[ins] = gpus

    my_label_config = {"job": "profiler-pods", "gpu": gpu_id}  # select gpu0
    #my_label_config = {"instance": instance}  # select all gpu
    # if one particular instance is given, update instances
    if instance != '':
        instances = [
            instance,
        ]
    for ins in instances:
        if gpu_id != '':
            gpus = [
                gpu_id,
            ]
        else:
            gpus = ins_gpu[ins]
            print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=parse_datetime(start_time),
                    end_time=parse_datetime(end_time))

                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values[
                            'ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']

            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)