Code example #1
0
def update_saved_prom_metrics(metrics, save_dir):
    """Fetch the requested Prometheus metrics, merge them into a single
    DataFrame and write it to ``<save_dir>/metrics.parquet``.

    Args:
        metrics: collection of metric names to fetch. Recognized names:
            "cluster_operator_conditions" (required — base frame),
            "cluster_installer", "cluster_version".
        save_dir: directory the parquet file is written into.

    Raises:
        ValueError: if "cluster_operator_conditions" is not requested,
            since every other metric is merged onto it.
    """
    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL",
                         "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    # the conditions frame is mandatory: it is the left side of every merge.
    # Previously a missing key caused a NameError at merge time.
    if "cluster_operator_conditions" not in metrics:
        raise ValueError(
            'metrics must include "cluster_operator_conditions"; it is the '
            "base frame the other metrics are merged onto")
    conditions_df = metric_preprocessors.opconds_metrics_to_df(
        metrics_raw=pc.get_current_metric_value(
            "cluster_operator_conditions"))

    # optional metrics: left as None when not requested
    install_df = None
    if "cluster_installer" in metrics:
        install_df = metric_preprocessors.installer_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_installer"))
    versions_df = None
    if "cluster_version" in metrics:
        versions_df = metric_preprocessors.version_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_version"))

    # combine all metrics (index-aligned; presumably keyed by cluster id —
    # TODO confirm against metric_preprocessors)
    metrics_df = conditions_df
    for extra_df in (install_df, versions_df):
        if extra_df is not None:
            metrics_df = metrics_df.merge(extra_df,
                                          how="left",
                                          left_index=True,
                                          right_index=True)

    # nans because some install types are neither upi nor ipi (unknown);
    # the columns only exist when cluster_installer was requested
    for col in ("install_type_IPI", "install_type_UPI"):
        if col in metrics_df.columns:
            metrics_df[col] = metrics_df[col].fillna(0)

    # save to volume. NOTE: pandas>=1.0 renamed to_parquet's first
    # parameter from ``fname`` to ``path`` — pass it positionally so the
    # call works on both old and new pandas (keyword fname= is a TypeError
    # on modern pandas).
    metrics_df.to_parquet(
        os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
Code example #2
0
File: get_data.py  Project: husky-parul/paocp
def pro():
    """Print the first current "up" sample for the haproxy backend in the
    ``prophet`` namespace of the local CRC cluster's monitoring stack.

    NOTE(review): the bearer token below is a hard-coded credential
    committed to source — rotate it and read it from the environment.
    """
    client = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers={
            "Authorization":
            "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
        },
        disable_ssl=True)
    samples = client.get_current_metric_value(
        metric_name="haproxy_backend_up{exported_namespace='prophet'}")
    print(MetricsList(samples)[0])
Code example #3
0
from configuration import Configuration
from graph_handler import GraphHandler
import schedule

_LOGGER = logging.getLogger(__name__)

# one predictor model per unique time series (metric name + label set)
PREDICTOR_MODEL_LIST = list()

# client for the Prometheus instance named in the service configuration
pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prometheus_headers,
    disable_ssl=True,
)

# Seed one predictor for every series currently reported for each configured
# metric name. Configuration.algorithm is the model class to instantiate —
# presumably a forecaster keyed on a rolling training window (TODO confirm
# against configuration.py, not visible here).
for metric in Configuration.metrics_list:
    metric_init = pc.get_current_metric_value(metric_name=metric)

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            Configuration.algorithm(
                unique_metric,
                rolling_data_window_size=Configuration.
                rolling_training_window_size,
            ))
GAUGE_DICT = dict()
for predictor in PREDICTOR_MODEL_LIST:
    unique_metric = predictor.metric
    label_list = list(unique_metric.label_config.keys())
    label_list.append("value_type")
    if unique_metric.metric_name not in GAUGE_DICT:
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """Exercise PrometheusConnect against a blocked network.

    The base class intercepts all outbound HTTP (returning 403 'BOOM!' by
    default) and provides the ``mock_response`` context manager used below.
    """

    def setUp(self):
        # the URL never resolves — every request hits the mock layer
        self.pc = PrometheusConnect(
            url='http://doesnt_matter.xyz', disable_ssl=True)

    def test_network_is_blocked(self):
        # any raw outbound request must be rejected by the base-class guard
        response = requests.get('https://google.com')
        self.assertEqual(response.status_code, 403)
        self.assertEqual(response.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        with self.mock_response('kekekeke', status_code=500) as handler:
            # nothing recorded until a request is actually made
            self.assertEqual(len(handler.requests), 0)

            response = requests.get('https://redhat.com')
            self.assertEqual(response.status_code, 500)
            self.assertEqual(response.text, 'kekekeke')
            self.assertEqual(len(handler.requests), 1)

            recorded = handler.requests[0]
            self.assertEqual(recorded.url, 'https://redhat.com/')

    def test_unauthorized(self):
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as ctx:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(ctx.exception))

    def test_broken_responses(self):
        # every client entry point must surface the blocked-network 403
        calls = (
            lambda: self.pc.all_metrics(),
            lambda: self.pc.get_current_metric_value("metric"),
            lambda: self.pc.get_metric_range_data("metric"),
            lambda: self.pc.custom_query_range("query", datetime.now(),
                                               datetime.now(), "1"),
            lambda: self.pc.custom_query("query"),
        )
        for call in calls:
            with self.assertRaises(PrometheusApiClientException) as ctx:
                call()
            self.assertEqual("HTTP Status Code 403 (b'BOOM!')",
                             str(ctx.exception))

    def test_all_metrics_method(self):
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            self.assertEqual(handler.requests[0].path_url,
                             "/api/v1/label/__name__/values")
Code example #5
0
File: data_dump.py  Project: CoderKevinZhang/AI-SIG
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """Dump all DCGM metrics over the given window to per-(instance, gpu)
    CSV files named "<instance>_<gpu>.csv".

    Args:
        start_time: window start, any form parse_datetime accepts (e.g. '5m').
        end_time: window end (e.g. 'now').
        instance: restrict the dump to this pod instance; '' means all found.
        gpu_id: restrict the dump to this gpu id; '' means all gpus found.

    Raises:
        SystemExit: if the Prometheus server cannot be reached.
    """
    # resolve the window once up front, in case multiple queries run at
    # different times later. (Previously the values were re-passed through
    # parse_datetime inside the query loop, re-parsing already-parsed
    # datetimes.)
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)

    # connect to prometheus server, exit if connection fails
    url = "http://prometheus:9090"  # use service name, instead of ip to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        # equivalent to exit(1) but does not depend on the site module
        raise SystemExit(1)

    # get all metrics under profiler job, note: some instances/gpus may not
    # have all the metrics due to model variance
    metrics = [a for a in prom.all_metrics() if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}

    # get a screenshot of all the instances (pod ip)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()

    # map each instance to the gpu ids it currently exposes
    ins_gpu = dict()
    for ins in instances:
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        ins_gpu[ins] = MetricSnapshotDataFrame(metric_data).gpu.unique()

    # if one particular instance is given, restrict the dump to it
    if instance != '':
        instances = [instance]

    for ins in instances:
        if gpu_id != '':
            gpus = [gpu_id]
        else:
            gpus = ins_gpu[ins]
            print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)

                # reorganize data to label_config and metric_values;
                # 'ds'/'y' are the timestamp/value columns MetricsList emits
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values[
                            'ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']

            df.to_csv("_".join([ins, gpu]) + ".csv")
Code example #6
0
class PrometheusInfraConnect:
    """Client for the per-region infra Prometheus collector, used to judge
    whether a VM or host is quiet enough to act on, based on vROps metrics.
    """

    def __init__(self, region, verify_ssl=False):
        # NOTE(review): verify_ssl is currently unused — login() always
        # passes disable_ssl=False. Wire it through or remove it.
        self.api = None
        self.region = region
        self.prometheus_infra = "https://prometheus-infra-collector." + self.region + ".cloud.sap"

        self.login()

    def login(self):
        """Create the Prometheus client and attach the mTLS client cert."""
        self.api = PrometheusConnect(url=self.prometheus_infra,
                                     disable_ssl=False,
                                     retry=None)
        # client-certificate auth against the infra collector (reaches into
        # the client's private session — no public API for this)
        self.api._session.cert = ('/etc/secret-volume/client_cert',
                                  '/etc/secret-volume/client_key')

    def find_vm_readiness(self, vcenter, vm):
        """Classify a VM's readiness from its vROps metrics.

        Returns "vm_readiness" when cpu-ready ratio and active memory
        writes are below threshold, "no_vm_readiness" otherwise, and
        "prom_issue" when the query fails.
        """
        avail_zone = self.region + vcenter.split("-", 2)[1]
        vm_label_config = {"datacenter": avail_zone, "virtualmachine": vm}
        try:
            vm_cpu_ready_ratio = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_virtualmachine_cpu_ready_ratio',
                    label_config=vm_label_config)[0]['value'][1])
            # fixed typo'd local name (was vm__memory_activewrite_kb)
            vm_memory_activewrite_kb = float(
                self.api.get_current_metric_value(
                    metric_name=
                    'vrops_virtualmachine_memory_activewrite_kilobytes',
                    label_config=vm_label_config)[0]['value'][1])
            # thresholds: <1 cpu-ready ratio and <21000000 KB active writes
            # — TODO confirm these cutoffs with the capacity team
            if vm_cpu_ready_ratio < 1 and vm_memory_activewrite_kb < 21000000:
                return "vm_readiness"
            log.info(
                "- INFO - vm name %s has cpu_ready_ratio %s and memory_activewrite_kb %s ",
                str(vm), str(vm_cpu_ready_ratio),
                str(vm_memory_activewrite_kb))
            return "no_vm_readiness"
        except Exception as e:
            # logger.warn is a deprecated alias of logger.warning
            log.warning("problems connecting vm %s in prometheus infra: %s",
                        str(vm), str(e))
            return "prom_issue"

    def find_host_contention(self, vcenter, host):
        """Classify a host's contention state from its vROps metrics.

        Returns "host_contention" when cpu contention is low and memory
        contention is zero (NOTE(review): the name reads inverted for that
        condition — confirm intended semantics with callers),
        "no_host_contention" otherwise, and "prom_issue" on query failure.
        """
        avail_zone = self.region + vcenter.split("-", 2)[1]
        host_label_config = {"datacenter": avail_zone, "hostsystem": host}
        try:
            host_cpu_contention = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_hostsystem_cpu_contention_percentage',
                    label_config=host_label_config)[0]['value'][1])
            host_memory_contention = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_hostsystem_memory_contention_percentage',
                    label_config=host_label_config)[0]['value'][1])
            if host_cpu_contention < 3 and host_memory_contention == 0:
                return "host_contention"
            log.info(
                "- INFO - host name %s has host_cpu_contention %s and  host_memory_contention %s",
                host, host_cpu_contention, host_memory_contention)
            return "no_host_contention"
        except Exception as e:
            log.warning("problems connecting host %s in prometheus infra: %s",
                        str(host), str(e))
            return "prom_issue"