def update_saved_prom_metrics(metrics, save_dir):
    """Fetch the requested metrics from Prometheus, combine them, and save to parquet.

    Args:
        metrics: container of metric names to fetch; recognized names are
            "cluster_operator_conditions", "cluster_installer" and
            "cluster_version". All three are required to build the output.
        save_dir: directory where "metrics.parquet" is written.

    Raises:
        ValueError: if any of the three required metric names is missing
            from ``metrics``.
    """
    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL", "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    # get metrics if available. Initialize to None so a missing request is
    # reported clearly below instead of surfacing as a NameError at merge time.
    conditions_df = install_df = versions_df = None
    if "cluster_operator_conditions" in metrics:
        conditions_df = metric_preprocessors.opconds_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_operator_conditions"))
    if "cluster_installer" in metrics:
        install_df = metric_preprocessors.installer_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_installer"))
    if "cluster_version" in metrics:
        versions_df = metric_preprocessors.version_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_version"))

    if conditions_df is None or install_df is None or versions_df is None:
        raise ValueError(
            "metrics must include 'cluster_operator_conditions', "
            "'cluster_installer' and 'cluster_version'")

    # combine all metrics
    metrics_df = conditions_df.merge(
        install_df, how="left", left_index=True, right_index=True)
    metrics_df = metrics_df.merge(
        versions_df, how="left", left_index=True, right_index=True)

    # nans because some install types are neither upi nor ipi (unknown)
    metrics_df["install_type_IPI"] = metrics_df["install_type_IPI"].fillna(0)
    metrics_df["install_type_UPI"] = metrics_df["install_type_UPI"].fillna(0)

    # save to volume. The path is passed positionally: the keyword was renamed
    # from "fname" to "path" in pandas 1.0, so fname= raises a TypeError on
    # modern pandas while the positional form works on every version.
    metrics_df.to_parquet(
        os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
def pro():
    """Smoke test: print the first current sample of the haproxy backend
    'up' metric for the 'prophet' namespace on the local CRC cluster."""
    # NOTE(review): bearer token is hard-coded in source -- it should come
    # from an environment variable or secret volume, not the repository.
    auth_header = {
        "Authorization": "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
    }
    client = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers=auth_header,
        disable_ssl=True)
    query = "haproxy_backend_up{exported_namespace='prophet'}"
    samples = MetricsList(client.get_current_metric_value(metric_name=query))
    print(samples[0])
from configuration import Configuration from graph_handler import GraphHandler import schedule _LOGGER = logging.getLogger(__name__) PREDICTOR_MODEL_LIST = list() pc = PrometheusConnect( url=Configuration.prometheus_url, headers=Configuration.prometheus_headers, disable_ssl=True, ) for metric in Configuration.metrics_list: metric_init = pc.get_current_metric_value(metric_name=metric) for unique_metric in metric_init: PREDICTOR_MODEL_LIST.append( Configuration.algorithm( unique_metric, rolling_data_window_size=Configuration. rolling_training_window_size, )) GAUGE_DICT = dict() for predictor in PREDICTOR_MODEL_LIST: unique_metric = predictor.metric label_list = list(unique_metric.label_config.keys()) label_list.append("value_type") if unique_metric.metric_name not in GAUGE_DICT:
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """ Network is blocked in this testcase, see base class """

    def setUp(self):
        # The URL is never actually contacted -- the base class intercepts
        # all outgoing HTTP.
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz',
                                    disable_ssl=True)

    def test_network_is_blocked(self):
        # Sanity check: with no mock installed, any request is answered by
        # the base class with a 403 "BOOM!" canned response.
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        # mock_response installs a canned reply and records every request
        # made while the context manager is active.
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)
            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')

        # The handler keeps its request log after the context exits.
        self.assertEqual(len(handler.requests), 1)
        request = handler.requests[0]
        self.assertEqual(request.url, 'https://redhat.com/')

    def test_unauthorized(self):
        # A non-2xx API response surfaces as PrometheusApiClientException
        # carrying the status code and raw body.
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        # Every public query helper must raise on the blocked-network 403.
        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_current_metric_value("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_metric_range_data("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query_range("query", datetime.now(), datetime.now(),
                                       "1")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query("query")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

    def test_all_metrics_method(self):
        # all_metrics should hit the label-values endpoint exactly once and
        # return the "data" list from the payload.
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))

        self.assertEqual(handler.call_count, 1)
        request = handler.requests[0]
        self.assertEqual(request.path_url, "/api/v1/label/__name__/values")
def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    Dump all DCGM metrics to CSV files, one file per (instance, gpu) pair.

    Args:
        start_time: start of the query window (dateparser-style string, e.g. '5m').
        end_time: end of the query window (e.g. 'now').
        instance: if non-empty, restrict the dump to this instance (pod ip).
        gpu_id: if non-empty, restrict the dump to this gpu id.
    """
    # Resolve the time window once up front so every query below covers the
    # exact same interval. (The original re-parsed these values inside the
    # query loop, which was redundant and could drift between queries.)
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)

    # connect to prometheus server, exit if connection fails
    url = "http://prometheus:9090"  # use service name, instead of ip, to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        exit(1)

    # all metrics under the profiler job; note: some instances/gpus may not
    # have all the metrics due to model variance
    metrics = [m for m in prom.all_metrics() if 'DCGM' in m]

    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}

    # get a screenshot of all the instances (pod ip)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()

    # map each instance to the set of gpus it currently reports
    ins_gpu = dict()
    for ins in instances:
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        ins_gpu[ins] = metric_df.gpu.unique()

    # if one particular instance is given, restrict to it
    if instance != '':
        instances = [instance]

    for ins in instances:
        # a given gpu_id overrides the discovered gpu list
        gpus = [gpu_id] if gpu_id != '' else ins_gpu[ins]
        print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)
                # reorganize data into label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    # the shared timestamp column is taken from the first
                    # metric that returns data
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values['ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']
            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
class PrometheusInfraConnect:
    """Client for the per-region infra Prometheus collector.

    Authenticates with a client certificate/key pair mounted from a secret
    volume and exposes helper queries over vROps VM/host metrics.
    """

    def __init__(self, region, verify_ssl=False):
        # NOTE(review): verify_ssl is accepted but never read -- login()
        # hard-codes disable_ssl=False. Confirm whether it should be wired up.
        self.api = None
        self.region = region
        self.prometheus_infra = "https://prometheus-infra-collector." + self.region + ".cloud.sap"
        self.login()

    def login(self):
        # Build the API client (TLS verification stays on: disable_ssl=False)
        # and attach the mTLS client cert/key mounted from the secret volume.
        self.api = PrometheusConnect(url=self.prometheus_infra,
                                     disable_ssl=False,
                                     retry=None)
        self.api._session.cert = ('/etc/secret-volume/client_cert',
                                  '/etc/secret-volume/client_key')

    def find_vm_readiness(self, vcenter, vm):
        # Classify a VM from its current vROps cpu-ready ratio and active
        # memory writes. Returns "vm_readiness", "no_vm_readiness", or
        # "prom_issue" when the query fails.
        # Availability zone = region + the middle token of the vcenter name.
        avail_zone = self.region + vcenter.split("-", 2)[1]
        vm_label_config = {"datacenter": avail_zone, "virtualmachine": vm}
        try:
            vm_cpu_ready_ratio = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_virtualmachine_cpu_ready_ratio',
                    label_config=vm_label_config)[0]['value'][1])
            vm__memory_activewrite_kb = float(
                self.api.get_current_metric_value(
                    metric_name=
                    'vrops_virtualmachine_memory_activewrite_kilobytes',
                    label_config=vm_label_config)[0]['value'][1])
            # thresholds: <1 cpu-ready ratio and <21,000,000 KB active writes
            # (presumably tuned empirically -- TODO confirm with owners)
            if vm_cpu_ready_ratio < 1 and vm__memory_activewrite_kb < 21000000:
                return "vm_readiness"
            log.info(
                "- INFO - vm name %s has cpu_ready_ratio %s and memory_activewrite_kb %s ",
                str(vm), str(vm_cpu_ready_ratio),
                str(vm__memory_activewrite_kb))
            return "no_vm_readiness"
        except Exception as e:
            log.warn("problems connecting vm %s in prometheus infra: %s",
                     str(vm), str(e))
            return "prom_issue"

    def find_host_contention(self, vcenter, host):
        # Classify a host from its current vROps cpu/memory contention.
        # Returns "host_contention", "no_host_contention", or "prom_issue"
        # when the query fails.
        avail_zone = self.region + vcenter.split("-", 2)[1]
        host_label_config = {"datacenter": avail_zone, "hostsystem": host}
        try:
            host_cpu_contention = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_hostsystem_cpu_contention_percentage',
                    label_config=host_label_config)[0]['value'][1])
            host_memory_contention = float(
                self.api.get_current_metric_value(
                    metric_name='vrops_hostsystem_memory_contention_percentage',
                    label_config=host_label_config)[0]['value'][1])
            # thresholds: <3% cpu contention and exactly 0 memory contention
            # (presumably tuned empirically -- TODO confirm with owners)
            if host_cpu_contention < 3 and host_memory_contention == 0:
                return "host_contention"
            log.info(
                "- INFO - host name %s has host_cpu_contention %s and host_memory_contention %s",
                host, host_cpu_contention, host_memory_contention)
            return "no_host_contention"
        except Exception as e:
            log.warn("problems connecting host %s in prometheus infra: %s",
                     str(host), str(e))
            return "prom_issue"