def train_individual_model(predictor_model, initial_run):
    """Download recent data for one metric and retrain its predictor.

    Args:
        predictor_model: object exposing ``metric`` (which carries
            ``metric_name`` and ``label_config``) and a
            ``train(data, interval)`` method.
        initial_run: when truthy, the whole rolling training window is
            downloaded; otherwise only the most recent metric chunk.

    Returns:
        The same ``predictor_model`` after ``train`` has been called on it.
    """
    target_metric = predictor_model.metric
    prom_client = PrometheusConnect(
        url=Configuration.prometheus_url,
        headers=Configuration.prom_connect_headers,
        disable_ssl=True,
    )

    # Incremental runs only need the latest chunk; the first run needs the
    # complete rolling window.
    fetch_from = datetime.now() - Configuration.metric_chunk_size
    if initial_run:
        fetch_from = datetime.now() - Configuration.rolling_training_window_size

    # Download new metric data from prometheus
    fetched_series = prom_client.get_metric_range_data(
        metric_name=target_metric.metric_name,
        label_config=target_metric.label_config,
        start_time=fetch_from,
        end_time=datetime.now(),
    )
    # Only the first returned series is used (IndexError if none came back).
    fresh_data = fetched_series[0]

    # Train the new model
    training_began = datetime.now()
    predictor_model.train(
        fresh_data, Configuration.retraining_interval_minutes
    )
    elapsed = datetime.now() - training_began
    _LOGGER.info(
        "Total Training time taken = %s, for metric: %s %s",
        str(elapsed),
        target_metric.metric_name,
        target_metric.label_config,
    )
    return predictor_model
def __init__(self, action):
    """Store the sample metadata and set up the optional Prometheus client.

    Args:
        action: mapping providing ``"uuid"``, ``"user"``,
            ``"cluster_name"``, ``"test_config"`` and the
            ``"starttime"``/``"endtime"`` values as epoch seconds.

    The Prometheus client (``self.pc``) is only created when both the
    ``prom_token`` and ``prom_url`` environment variables are set;
    ``self.get_data`` records whether that happened.
    """
    self.sample_info_dict = action
    self.uuid = action["uuid"]
    self.user = action["user"]
    self.cluster_name = action["cluster_name"]
    self.test_config = action["test_config"]

    # Epoch seconds (possibly given as strings) -> datetime objects.
    self.start = datetime.fromtimestamp(
        int(self.sample_info_dict["starttime"]))
    self.end = datetime.fromtimestamp(
        int(self.sample_info_dict["endtime"]))

    # Step value to be used in the prometheus query. The default is 30
    # seconds (openshift default scraping interval) but can be overridden
    # with the ``prom_step`` environment variable.
    # NOTE(review): the override is kept as the raw string from the
    # environment while the default is an int - confirm consumers accept both.
    self.T_Delta = os.environ.get("prom_step", 30)

    self.get_data = False
    if "prom_token" in os.environ and "prom_url" in os.environ:
        self.get_data = True
        token = os.environ["prom_token"]
        self.url = os.environ["prom_url"]
        self.headers = {'Authorization': "Bearer " + token}
        self.pc = PrometheusConnect(url=self.url,
                                    headers=self.headers,
                                    disable_ssl=True)
    else:
        # Logger.warn is a deprecated alias that was removed in Python 3.13.
        logger.warning(
            "snafu service account token and prometheus url not set \n"
            " No Prometheus data will be indexed")
def get_gpu_number(self):
    """Return the largest per-exporter count of GPUs not bound to a pod.

    Pods labelled ``app=nvidia-dcgm-exporter`` are listed across all
    namespaces. For each of them Prometheus is asked for the number of GPU
    series exposed by that exporter minus the number of those series that
    carry an ``exported_pod`` label; the maximum over all exporters is
    returned.

    Returns:
        int: the maximum found, or 0 when no exporter pod is available
        (including when listing the pods failed).
    """
    max_available_gpu = 0
    exporter_pods = []

    # Verify if dcgm-exporter is deployed
    try:
        pod_list = self.api_client.list_pod_for_all_namespaces(
            label_selector="app=nvidia-dcgm-exporter")
        # Extract the items inside the try so a failed call leaves an empty
        # list instead of an object without an ``items`` attribute.
        exporter_pods = pod_list.items or []
    except ApiException as e:
        if e.status != 404:
            _LOGGER.error(
                "Exception when calling DCGM exporter pods: %s\n", e)

    if not exporter_pods:
        return max_available_gpu

    prom = PrometheusConnect(
        url=self.get_prometheus_url(),
        headers={
            "Authorization":
            "Bearer " + self.get_openshift_prometheus_token()
        },
        disable_ssl=True)

    for pod in exporter_pods:
        pod_ip = pod.status.pod_ip
        if not pod_ip:
            # No IP assigned, so there is no exporter instance to query.
            continue
        gpu_query = (
            'count (count by (UUID,GPU_I_ID) '
            '(DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="' + pod_ip +
            ':9400"}) or vector(0)) - count(count by (UUID,GPU_I_ID) '
            '(DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="' + pod_ip +
            ':9400", exported_pod=~".+"}) or vector(0))')
        query_result = prom.custom_query(query=gpu_query)
        if not query_result:
            continue
        available_here = int(query_result[0]['value'][1])
        max_available_gpu = max(max_available_gpu, available_here)

    return max_available_gpu
def collect_metrics():
    """Collect metrics from Prometheus/Thanos.

    Runs every query listed under ``"query"`` for each entry of
    ``SLIReport.REPORT_SLI_CONTEXT``.

    Returns:
        dict: ``{sli_name: {query_name: value}}`` where ``value`` is the
        float taken from the first returned sample, or ``None`` when the
        query (or the conversion of its result) failed.
    """
    thanos = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True,
    )

    collected_info = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        sli_values = collected_info[sli_name] = {}

        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = thanos.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                value = float(metric_data[0]["value"][1])
            except Exception as e:
                # Best effort: a failing query is logged and recorded as None.
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                value = None
            sli_values[query_name] = value

    return collected_info
def main():
    """Collect per-deployment CPU usage and pass it to ``metric``.

    Steps:
      1. Read the list of deployments from the ``deployments`` collection
         of the configured MongoDB database.
      2. Query Prometheus for the CPU usage of every deployment workload in
         the ``default`` namespace.
      3. Parse the resource spec (JSON) from stdin and call
         ``metric(spec, list_of_deployments, deployments_cpu)``.

    Any exception is reported on stderr and the process exits with
    status 1.
    """
    try:
        # Setting up Mongo DB
        mongo_host = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        mongo_port = str(os.environ.get('MONGO_PORT', '27017'))
        mongo_db = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        mongo_user = str(os.environ.get('MONGO_USERNAME', 'root'))
        # NOTE(security): a default password is baked into the source; it is
        # kept for backward compatibility but should come from the
        # environment / a secret store only.
        mongo_pass = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))

        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                mongo_user, mongo_pass, mongo_host, mongo_port))

        cpa_db = mongodb_client[mongo_db]
        deployments_collection = cpa_db.deployments

        # If several documents exist, the list of the last one wins.
        list_of_deployments = []
        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # get workload cpu
        query_workload_cpu = """
        sum(
          irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
        * on(namespace,pod)
          group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
        ) by (workload, workload_type)
        """

        def get_deployments_cpu_usage(list_of_deployments):
            """Map each deployment name to its CPU usage, or None if absent."""
            wl_cpu_res = prom.custom_query(query=query_workload_cpu)
            # filter results (unit is millicores)
            filtered_cpu_query = {
                q['metric']['workload']: float(q['value'][1]) * 1000
                for q in wl_cpu_res
                if q['metric']['workload'] in list_of_deployments
            }
            # if metric skipped, put in None instead
            for d in list_of_deployments:
                filtered_cpu_query.setdefault(d, None)
            return filtered_cpu_query

        deployments_cpu = get_deployments_cpu_usage(list_of_deployments)

        # Parse spec into a dict; the spec arrives as JSON on stdin.
        spec = json.loads(sys.stdin.read())
        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}")
        # sys.exit instead of the site-provided ``exit`` helper, which is
        # not guaranteed to exist (e.g. ``python -S`` or frozen builds).
        sys.exit(1)
def test_retry_on_error(self):  # noqa D102
    # HTTP 400 is declared retryable, so a malformed query is expected to
    # use up all retries and surface as a RetryError.
    retry_policy = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
    client = PrometheusConnect(
        url=self.prometheus_host,
        disable_ssl=True,
        retry=retry_policy,
    )

    expected_error = requests.exceptions.RetryError
    with self.assertRaises(expected_error,
                           msg="too many 400 error responses"):
        client.custom_query("BOOM.BOOM!#$%")
def __init__(self, host, port, disablessl):
    """Pick the URL scheme and try to create the Prometheus client.

    Args:
        host: host name used in the URL.
        port: port used in the URL; it is concatenated as-is, so it has to
            be a string.
        disablessl: when equal to ``True`` plain ``http`` is used,
            otherwise ``https``; also forwarded as ``disable_ssl``.

    Any exception raised while building the client is swallowed and only
    the word "Fehler" (German for "error") is printed, in which case
    ``self.prom`` is not set.
    """
    # Kept as an equality test against True on purpose: other truthy values
    # (e.g. non-empty strings) still select https, as before.
    self.schema = "http" if disablessl == True else "https"  # noqa: E712
    try:
        base_url = "".join((self.schema, "://", host, ":", port))
        self.prom = PrometheusConnect(url=base_url, disable_ssl=disablessl)
    except Exception:
        print("Fehler")
def pro():
    """Print the first current ``haproxy_backend_up`` series of 'prophet'.

    Queries the current value of ``haproxy_backend_up`` restricted to
    ``exported_namespace='prophet'``, wraps the answer in a ``MetricsList``
    and prints its first element.
    """
    # NOTE(review): the endpoint and the bearer token are hard-coded; the
    # token looks like a credential and should not live in source control.
    endpoint = "https://prometheus-k8s-openshift-monitoring.apps-crc.testing"
    auth_headers = {
        "Authorization":
        "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
    }
    client = PrometheusConnect(url=endpoint,
                               headers=auth_headers,
                               disable_ssl=True)

    current_values = client.get_current_metric_value(
        metric_name="haproxy_backend_up{exported_namespace='prophet'}")
    up_metric = MetricsList(current_values)
    print(up_metric[0])
def get_current(self) -> float:
    """Run ``self.query`` against Prometheus and return its latest value.

    The client is built from ``self.config`` (``"url"`` defaults to
    ``http://localhost:9090`` and ``"disable_ssl"`` to ``True``).

    Returns:
        The last element of the ``"value"`` entry of the first result,
        converted to float.

    Raises:
        Exception: when the query returns no result.
    """
    settings = self.config
    client = PrometheusConnect(
        url=settings.get("url", "http://localhost:9090"),
        disable_ssl=settings.get("disable_ssl", True),
    )

    result = client.custom_query(query=self.query)
    if result:
        log.info(f"Prometheus query result: {result}")
        first_sample = result[0].get("value")
        return float(first_sample[-1])

    log.error("Prometheus query: no result")
    raise Exception("Prometheus query: no result")
def timed_job():
    """Export the last hour of every Prometheus metric to blob storage.

    Connection settings are read from the ``DEFAULT`` section of
    ``config/config.cfg`` (``ACCOUNT``, ``KEY``, ``PROM``, ``CONTAINER``,
    ``URL``). For every metric name known to Prometheus the samples of the
    last hour are fetched, flattened into one DataFrame (labels joined with
    their samples) and written to the container under
    ``<date>/<time-without-separators>``.
    """
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    account_key = config.get('DEFAULT', 'KEY')
    prom_credentials = config.get('DEFAULT', 'PROM')
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')

    blob_service = BlockBlobService(account_name=account,
                                    account_key=account_key)

    # HTTP basic auth header built from the PROM setting.
    basic_auth = b64encode(prom_credentials.encode()).decode("ascii")
    headers = {'Authorization': 'Basic %s' % basic_auth}
    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)

    window_end = datetime.now()
    window = timedelta(hours=1)

    metrics = []  # one row of labels per series
    values = []  # one row per sample
    # The id ties a series' label row to its sample rows in the merge below.
    # It must be unique across ALL metric names; restarting it for each
    # metric would join labels of one metric with samples of another.
    series_id = 0
    for metric_name in prom.all_metrics():
        series_list = prom.get_metric_range_data(
            metric_name=metric_name,
            start_time=window_end - window,
            end_time=window_end,
            chunk_size=window)
        for series in series_list:
            for name, payload in series.items():
                payload = dict(payload)
                if name == 'metric':
                    payload['id'] = series_id
                    metrics.append(payload)
                else:
                    # ``payload`` maps timestamp -> sample value here.
                    values.extend({
                        'time': timestamp,
                        'value': sample,
                        'id': series_id
                    } for timestamp, sample in payload.items())
            series_id += 1

    df = pd.DataFrame(metrics)
    df1 = pd.DataFrame(values)
    df = pd.merge(df, df1, how='inner', left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df = df.drop(['endpoint', 'service', 'id'], axis=1)

    # Take a single timestamp so the date and time parts of the blob name
    # cannot straddle midnight.
    uploaded_at = datetime.now()
    blob_name = (str(uploaded_at.date()) + '/' +
                 str(uploaded_at.time()).replace(':', '').replace(".", ''))
    write_pandas_dataframe_to_blob(blob_service, df, container, blob_name)
def __init__(self, docker_client_services_path,
             docker_server_services_path, ingress_distribution_file_path,
             docker_lb_container_path, service_list):
    """Load the service definitions and create the Prometheus client.

    Each ``*_path`` argument is handed to ``get_docker_services`` and the
    result stored on the instance; ``service_list`` is stored unchanged.
    """
    load = get_docker_services
    self.docker_client_services = load(docker_client_services_path)
    self.docker_server_services = load(docker_server_services_path)
    self.get_ingress_distribution = load(ingress_distribution_file_path)
    self.docker_lb_services = load(docker_lb_container_path)

    # NOTE(review): the Prometheus endpoint is hard-coded.
    prometheus_url = "http://131.155.35.54:9090"
    self.prom = PrometheusConnect(url=prometheus_url, disable_ssl=True)

    self.capture_time = CAPTURE_TIME
    self.service_list = service_list
class PromSummarizer(object):
    """Fetch the samples of a Prometheus expression over a number of days."""

    def __init__(self, url, disable_ssl=False):
        """Create the Prometheus client for ``url``."""
        self.prom = PrometheusConnect(url=url, disable_ssl=disable_ssl)

    def fetch(self, expression, number_of_days):
        """Yield ``(timestamp, value)`` tuples for ``expression``.

        Data is requested from ``number_of_days`` days ago until now in
        chunks of one day. Only the first metric of the combined result is
        used. Timestamps are yielded as timezone-aware datetimes in the
        local timezone.
        """
        window_start = parse_datetime('%dd' % number_of_days)
        window_end = parse_datetime('now')
        one_day = parse_timedelta('now', '1d')

        raw_chunks = self.prom.get_metric_range_data(
            expression,
            start_time=window_start,
            end_time=window_end,
            chunk_size=one_day,
        )

        # MetricsList combines the chunks into a single metric
        combined = MetricsList(raw_chunks)[0]

        # Yield tuples of timestamp, value
        for row in combined.metric_values.values:
            stamp, val = row.tolist()
            # The timestamp is delivered in UTC, convert to local
            utc_stamp = stamp.to_pydatetime().replace(tzinfo=tz.tzutc())
            yield utc_stamp.astimezone(tz.tzlocal()), val
class Configuration:
    """Configuration of metrics-exporter.

    Every setting is read from a required environment variable while the
    class body is evaluated, so a missing variable raises ``KeyError`` at
    import time.
    """

    # --- Prometheus -------------------------------------------------------
    URL = os.environ["PROMETHEUS_HOST_URL"]
    PROMETHEUS_SERVICE_ACCOUNT_TOKEN = os.environ[
        "PROMETHEUS_SERVICE_ACCOUNT_TOKEN"
    ]
    HEADERS = {
        "Authorization": "bearer {}".format(PROMETHEUS_SERVICE_ACCOUNT_TOKEN),
    }
    # Client shared through the class, created with disable_ssl=True.
    PROM = PrometheusConnect(url=URL, disable_ssl=True, headers=HEADERS)

    # --- Namespaces -------------------------------------------------------
    THOTH_BACKEND_NAMESPACE = os.environ["THOTH_BACKEND_NAMESPACE"]
    THOTH_MIDDLETIER_NAMESPACE = os.environ["THOTH_MIDDLETIER_NAMESPACE"]
    THOTH_AMUN_INSPECTION_NAMESPACE = os.environ[
        "THOTH_AMUN_INSPECTION_NAMESPACE"
    ]

    # --- Ceph -------------------------------------------------------------
    CEPH_ACCESS_KEY_ID = os.environ["THOTH_CEPH_KEY_ID"]
    CEPH_ACCESS_SECRET_KEY = os.environ["THOTH_CEPH_SECRET_KEY"]
    CEPH_BUCKET_PREFIX = os.environ["THOTH_CEPH_BUCKET_PREFIX"]
    S3_ENDPOINT_URL = os.environ["THOTH_S3_ENDPOINT_URL"]
    CEPH_BUCKET = os.environ["THOTH_CEPH_BUCKET"]
    DEPLOYMENT_NAME = os.environ["THOTH_DEPLOYMENT_NAME"]

    # --- Kebechet ---------------------------------------------------------
    GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
def client(self):
    """Return a ``PrometheusConnect`` built from ``self._conn_params``.

    Certificate verification stays enabled unless the environment variable
    ``APIALCHEMY_PROMETHEUS_SSL_VERIFY`` is set to something other than
    ``true`` (case-insensitive); in that case urllib3's insecure-request
    warnings are silenced as well.
    """
    verify_setting = os.getenv('APIALCHEMY_PROMETHEUS_SSL_VERIFY', 'true')
    ssl_disabled = verify_setting.lower() != 'true'

    if ssl_disabled:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    return PrometheusConnect(**self._conn_params, disable_ssl=ssl_disabled)
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """Summarize recent per-GPU readings of one metric for a node.

    Args:
        url: Prometheus base URL.
        pod_ip: IP whose ``<pod_ip>:9400`` instance is selected.
        ana_window: start of the analysis window, parsed by
            ``parse_datetime`` (the window ends "now").
        metrics: name of the metric to fetch.

    Returns:
        dict: one entry per series carrying a ``gpu`` label, keyed by
        ``DOMAIN + "/gpu-" + <gpu label>``. Each value is the ``str()`` of
        the dict returned by ``collect_cur_usage`` extended with
        ``cyclic_pattern``, ``period`` (only when a cycle was detected) and
        ``max_mem_util``. An empty dict is returned when the Prometheus
        connection check fails.
    """
    promi = PrometheusConnect(url=url, disable_ssl=True)

    # A failing connection check is logged and answered with an empty dict.
    try:
        promi.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return dict()

    instance = pod_ip + ":9400"  # tmp fixed
    window_start = parse_datetime(ana_window)
    window_end = parse_datetime("now")

    # select current host metrics
    metric_data = promi.get_metric_range_data(
        metric_name=metrics,
        label_config={"instance": instance},
        start_time=window_start,
        end_time=window_end)

    ret_dict = dict()
    # reorganize data to label_config and metric_values, then iterate
    # through all the gpus on the node
    for series in MetricsList(metric_data):
        if 'gpu' not in series.label_config:
            # series without a gpu label carry no per-GPU data
            continue

        gpu_index = series.label_config['gpu']  # gpu index label from dcgm
        key = DOMAIN + "/gpu-" + gpu_index
        cur_usage = collect_cur_usage(int(gpu_index))

        # second column of metric_values holds the sample values
        readings = series.metric_values.iloc[:, 1]
        peak = readings.max()

        is_cyclic = False
        if peak > 0:
            cyclic, period = cyclic_pattern_detection(readings)
            if cyclic:
                is_cyclic = True
        cur_usage['cyclic_pattern'] = is_cyclic
        if is_cyclic:
            cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(readings.max())

        # Important: flatten nested dictionary to string, otherwise error
        # "cannot unmarshal string into Go value of type
        # map[string]interface {}"
        ret_dict[key] = str(cur_usage)

    return ret_dict
def update_saved_prom_metrics(metrics, save_dir):
    """Fetch the requested cluster metrics and save them as one parquet file.

    Args:
        metrics: container of metric names to fetch. Recognized names are
            ``"cluster_operator_conditions"``, ``"cluster_installer"`` and
            ``"cluster_version"``.
        save_dir: directory in which ``metrics.parquet`` is written.

    Raises:
        ValueError: if none of the recognized metric names was requested.

    The frames of the requested metrics are left-merged on their index in
    the order listed above, so the first requested one is the left side.
    """
    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL",
                         "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    # (metric name, preprocessor) pairs in merge order
    preprocessors = (
        ("cluster_operator_conditions",
         metric_preprocessors.opconds_metrics_to_df),
        ("cluster_installer", metric_preprocessors.installer_metrics_to_df),
        ("cluster_version", metric_preprocessors.version_metrics_to_df),
    )

    # get metrics if available; names that were not requested are skipped
    # instead of leaving a variable unbound for the merge below
    frames = [
        to_df(metrics_raw=pc.get_current_metric_value(name))
        for name, to_df in preprocessors
        if name in metrics
    ]
    if not frames:
        raise ValueError(
            "none of the supported metrics was requested: "
            + ", ".join(name for name, _ in preprocessors))

    # combine all metrics
    metrics_df = frames[0]
    for frame in frames[1:]:
        metrics_df = metrics_df.merge(frame,
                                      how="left",
                                      left_index=True,
                                      right_index=True)

    if "cluster_installer" in metrics:
        # nans because some install types are neither upi nor ipi (unknown)
        for column in ("install_type_IPI", "install_type_UPI"):
            metrics_df[column] = metrics_df[column].fillna(0)

    # save to volume; the target is passed positionally because its keyword
    # name differs between pandas versions (``fname`` vs ``path``)
    metrics_df.to_parquet(
        os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
class PrometheusClient:
    """Small wrapper around ``PrometheusConnect`` for server tick metrics."""

    def __init__(self, promhost, promport):
        """Connect to ``http://<promhost>:<promport>``."""
        endpoint = "http://%s:%s" % (promhost, promport)
        self.prom = PrometheusConnect(url=endpoint, disable_ssl=True)

    def get_ticktime(self):
        """Return the ``values`` of the first ``overall_ticktime`` series."""
        series = self.__get_metric_for_last_five_mins("overall_ticktime")
        return series[0].get("values")

    def get_dim_ticktime(self):
        """Return ``dim_ticktime`` values keyed by ``dimension_name``."""
        return self.__values_by_dimension("dim_ticktime")

    def get_players(self):
        """Return the ``player`` label of every ``player_playtime`` result."""
        return [
            entry.get("metric").get("player")
            for entry in self.prom.custom_query("player_playtime")
        ]

    def get_tps(self):
        """Return the ``values`` of the first ``overall_tps`` series."""
        series = self.__get_metric_for_last_five_mins("overall_tps")
        return series[0].get("values")

    def get_dim_tps(self):
        """Return ``dim_tps`` values keyed by ``dimension_name``."""
        return self.__values_by_dimension("dim_tps")

    def __values_by_dimension(self, metricname):
        """Map each series' ``dimension_name`` label to its ``values``."""
        return {
            series.get("metric").get("dimension_name"): series.get("values")
            for series in self.__get_metric_for_last_five_mins(metricname)
        }

    def __get_metric_for_last_five_mins(self, metricname):
        """Fetch the range data of ``metricname`` for the last five minutes."""
        five_minutes = datetime.timedelta(minutes=5)
        return self.prom.get_metric_range_data(
            metric_name=metricname,
            start_time=datetime.datetime.now() - five_minutes,
            end_time=datetime.datetime.now(),
        )
def queryMetrics(customquery, trim):
    """Run a query on the local Prometheus and return its top rows.

    Args:
        customquery: query string passed to ``custom_query``.
        trim: number of rows to keep.

    Returns:
        DataFrame with one row per returned metric, rows whose value is the
        string "NaN" removed, ``value`` converted to numeric and
        ``timestamp`` to datetime, sorted by ``value`` descending and cut
        to the first ``trim`` rows.
    """
    client = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    samples = client.custom_query(query=customquery)

    # To make it a table where each row is a metric
    df = MetricSnapshotDataFrame(samples)
    df = df[df.value != "NaN"]

    df[['value']] = df[['value']].apply(pd.to_numeric)
    df[['timestamp']] = df[['timestamp']].apply(pd.to_datetime, unit='s')

    ranked = df.sort_values('value', ascending=False)
    return ranked.head(trim)
def launch_prometheus():
    """Port-forward the Prometheus pod and return a client for it.

    Exits the process with ``util.EXIT_FAILURE`` when Kubernetes is not set
    up. Otherwise the name of the Prometheus pod in ``istio-system`` is
    looked up with kubectl, port 9090 is forwarded in a new process and,
    after a two second pause, a client for ``http://localhost:9090`` is
    created.

    Returns:
        tuple: ``(port_forward_process, prometheus_client)``.
    """
    kube_status = kube_env.check_kubernetes_status()
    if kube_status != util.EXIT_SUCCESS:
        log.error("Kubernetes is not set up. Did you run the deployment script?")
        sys.exit(util.EXIT_FAILURE)

    find_pod_cmd = ("kubectl get pods -n istio-system -lapp=prometheus "
                    " -o jsonpath={.items[0].metadata.name}")
    prom_pod_name = util.get_output_from_proc(find_pod_cmd).decode("utf-8")

    forward_cmd = f"kubectl port-forward -n istio-system {prom_pod_name} 9090"
    prom_proc = util.start_process(forward_cmd, preexec_fn=os.setsid)

    # Pause before connecting to the forwarded port.
    time.sleep(2)
    prom_api = PrometheusConnect(url="http://localhost:9090",
                                 disable_ssl=True)
    return prom_proc, prom_api
class my_prometheus():
    """Minimal Prometheus client that remembers the result of its last query."""

    def __init__(self, host, port, disablessl):
        """Pick the URL scheme and try to create the Prometheus client.

        ``http`` is used when ``disablessl`` equals ``True``, otherwise
        ``https``. ``port`` is concatenated as-is, so it has to be a
        string. Any exception while building the client is swallowed and
        only "Fehler" (German for "error") is printed, in which case
        ``self.prom`` is not set.
        """
        # Kept as an equality test against True on purpose: other truthy
        # values still select https, as before.
        self.schema = "http" if disablessl == True else "https"  # noqa: E712
        try:
            base_url = "".join((self.schema, "://", host, ":", port))
            self.prom = PrometheusConnect(url=base_url,
                                          disable_ssl=disablessl)
        except Exception:
            print("Fehler")

    def prom_query(self, query):
        """Run ``query`` and store the result on the instance.

        ``self.lasttemps`` receives the raw query result and
        ``self.lasttemp`` the second element of the largest entry in the
        ``"values"`` list of the first result.
        """
        self.lasttemps = self.prom.custom_query(query=query)
        samples = self.lasttemps[0]["values"]
        newest_first = sorted(samples, reverse=True)
        self.lasttemp = newest_first[0][1]
def check_database_metrics_availability(configuration: Configuration) -> bool:
    """Check database metrics (Prometheus/Thanos) availability.

    Sends a GET request to the root of ``configuration.thanos_url`` using
    the client's own session, headers and SSL verification setting.

    Returns:
        True when the response is OK, False otherwise.
    """
    pc = PrometheusConnect(
        url=configuration.thanos_url,
        headers={"Authorization": f"bearer {configuration.thanos_token}"},
        disable_ssl=True,
    )

    root_url = "{0}/".format(pc.url)
    response = pc._session.get(
        root_url,
        verify=pc.ssl_verification,
        headers=pc.headers,
        params={},
    )
    return bool(response.ok)
def build_reports(timestamp, config, es_url, thanos_url, grafana_url,
                  target_index):
    """Build, enrich and index one report per benchmark that has results.

    Clusters and documents are collected from Elasticsearch for
    ``timestamp`` using ``config['searchIndices']``. For every benchmark of
    every cluster a ``podLatency`` report is assembled; reports with empty
    results are dropped. The remaining reports are enriched and indexed
    into ``target_index``; each index response is printed.
    """
    es_client = Elasticsearch(es_url)
    thanos_client = PrometheusConnect(thanos_url, disable_ssl=True)

    clusters, docs = collect.get_clusters(es_client,
                                          timestamp,
                                          indices=config['searchIndices'])

    reports = []
    for cluster in clusters:
        cluster_benchmarks = collect.get_benchmarks_for_cluster(
            cluster['cluster_name'], docs, config['ignoreTags'])
        for benchmark in cluster_benchmarks:
            metadata = benchmark['metadata']
            results = collect.get_benchmark_results(benchmark, es_client)
            report = {
                **cluster,
                'report_type': 'podLatency',
                'metadata': metadata,
                "results": results,
            }
            if results == {}:
                # nothing to report for this benchmark
                continue
            print(f"cluster {report['cluster_name']} has results")
            reports.append(report)

    enriched_reports = enrich.enrich_reports(reports, grafana_url,
                                             thanos_client, config)
    for enriched in enriched_reports:
        print(index.index_report(es_client, enriched, target_index))
import tornado.ioloop
import tornado.web
from tornado.httpserver import HTTPServer
from prometheus_client import Gauge, generate_latest, REGISTRY
from prometheus_api_client import PrometheusConnect, Metric
from configuration import Configuration
from graph_handler import GraphHandler
import schedule

_LOGGER = logging.getLogger(__name__)

# One predictor model per unique time series found for the configured metrics.
PREDICTOR_MODEL_LIST = []

pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prometheus_headers,
    disable_ssl=True,
)

for metric in Configuration.metrics_list:
    # Query the current value of the metric to discover its unique series
    # and create one model for each of them.
    metric_init = pc.get_current_metric_value(metric_name=metric)

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            Configuration.algorithm(
                unique_metric,
                rolling_data_window_size=(
                    Configuration.rolling_training_window_size
                ),
            )
        )

GAUGE_DICT = {}
class TestPrometheusConnect(unittest.TestCase):
    """
    Test module for class PrometheusConnect
    """

    def setUp(self):
        """
        set up connection settings for prometheus
        """
        host = os.getenv("PROM_URL")
        self.prometheus_host = host
        self.pc = PrometheusConnect(url=host, disable_ssl=True)

    def test_metrics_list(self):
        """
        Check if setup was done correctly
        """
        metrics_list = self.pc.all_metrics()
        received_any = len(metrics_list) > 0
        self.assertTrue(received_any, "no metrics received from prometheus")

    def test_get_metric_range_data(self):
        """Range data for "up" starts and ends within a minute of the window."""
        one_minute = timedelta(minutes=1)
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()

        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)
        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")

        first = metric_objects_list[0]
        # first sample lies inside (start, start + 1 min)
        self.assertTrue(
            start_time.timestamp() < first.start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + one_minute).timestamp() >
            first.start_time.timestamp(),
            "invalid metric start time",
        )
        # last sample lies inside (end - 1 min, end)
        self.assertTrue(
            end_time.timestamp() > first.end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - one_minute).timestamp() < first.end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):
        """Same window checks as above when data is fetched in 7 min chunks."""
        one_minute = timedelta(minutes=1)
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)

        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)
        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")

        first = metric_objects_list[0]
        self.assertTrue(
            start_time.timestamp() < first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + one_minute).timestamp() >
            first.start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - one_minute).timestamp() < first.end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):
        """String values for the time arguments are rejected with TypeError."""
        start_time = datetime.now() - timedelta(minutes=20)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=10)

        valid_kwargs = dict(start_time=start_time,
                            end_time=end_time,
                            chunk_size=chunk_size)
        # (argument to corrupt, bad value, failure message)
        bad_inputs = (
            ("start_time", "20m", "start_time accepted invalid value type"),
            ("end_time", "10m", "end_time accepted invalid value type"),
            ("chunk_size", "10m", "chunk_size accepted invalid value type"),
        )
        for argument, bad_value, message in bad_inputs:
            kwargs = dict(valid_kwargs, **{argument: bad_value})
            with self.assertRaises(TypeError, msg=message):
                _ = self.pc.get_metric_range_data(metric_name="up", **kwargs)
def setUp(self):
    """
    set up connection settings for prometheus

    The host is taken from the PROM_URL environment variable.
    """
    host = os.getenv("PROM_URL")
    self.prometheus_host = host
    self.pc = PrometheusConnect(url=host, disable_ssl=True)
def setUp(self):
    """Create the client under test against a placeholder URL."""
    # NOTE(review): the host name suggests no real server is contacted -
    # confirm against the test base class.
    placeholder_url = 'http://doesnt_matter.xyz'
    self.pc = PrometheusConnect(url=placeholder_url, disable_ssl=True)
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """
    Network is blocked in this testcase, see base class
    """

    def setUp(self):
        placeholder_url = 'http://doesnt_matter.xyz'
        self.pc = PrometheusConnect(url=placeholder_url, disable_ssl=True)

    def test_network_is_blocked(self):
        """Without a mocked response every request is answered 403 'BOOM!'."""
        blocked = requests.get('https://google.com')
        self.assertEqual(blocked.status_code, 403)
        self.assertEqual(blocked.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        """A mocked response is returned and the request is recorded."""
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)

            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')

            self.assertEqual(len(handler.requests), 1)
            recorded = handler.requests[0]
            self.assertEqual(recorded.url, 'https://redhat.com/')

    def test_unauthorized(self):
        """A 403 response surfaces as PrometheusApiClientException."""
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        """Every query method reports the blocked-network 403 the same way."""
        expected_message = "HTTP Status Code 403 (b'BOOM!')"
        api_calls = (
            lambda: self.pc.all_metrics(),
            lambda: self.pc.get_current_metric_value("metric"),
            lambda: self.pc.get_metric_range_data("metric"),
            lambda: self.pc.custom_query_range("query", datetime.now(),
                                               datetime.now(), "1"),
            lambda: self.pc.custom_query("query"),
        )
        for api_call in api_calls:
            with self.assertRaises(PrometheusApiClientException) as exc:
                api_call()
            self.assertEqual(expected_message, str(exc.exception))

    def test_all_metrics_method(self):
        """all_metrics issues exactly one request to the label-values API."""
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            recorded = handler.requests[0]
            self.assertEqual(recorded.path_url,
                             "/api/v1/label/__name__/values")
def __init__(self, nodes: NodeDataView):
    """Initialize the base class with ``nodes`` and create the Prometheus client.

    The client URL is read from ``settings.prometheus.url``.
    """
    super().__init__(nodes)
    prometheus_url = settings.prometheus.url
    self._prom = PrometheusConnect(url=prometheus_url)
class PrometheusMetricProvider(MetricProvider):
    """Metric provider whose data comes from Prometheus custom queries.

    ``refresh_data`` runs one query per metric and stores, under a fixed key
    in ``self._data``, a dict mapping a label value (topic, group, deployment
    or connector) to the converted last element of each sample's ``"value"``.
    """

    def __init__(self, nodes: NodeDataView):
        """Initialise the parent with ``nodes`` and create the client."""
        super().__init__(nodes)
        self._prom = PrometheusConnect(url=settings.prometheus.url)

    def get_metric(self, metric: PrometheusMetric) -> List:
        """Run ``metric.query``; log and return ``[]`` on a client API error."""
        try:
            samples = self.__prom_request(metric.query)
        except PrometheusApiClientException as e:
            logger.error(f"Error pulling {metric}: {e}")
            samples = []
        return samples

    def __prom_request(self, query: str) -> List:
        """Send ``query`` through the client's ``custom_query``."""
        return self._prom.custom_query(query)

    def refresh_data(self):
        """Re-pull every metric, one after another, into ``self._data``."""
        logger.debug("Pulling metrics from Prometheus")
        # Order matters only in that it fixes the sequence of queries issued.
        collectors = (
            ("messages_in", self.__get_messages_in),
            ("messages_out", self.__get_messages_out),
            ("consumer_lag", self.__get_consumer_lag),
            ("consumer_read_rate", self.__get_consumer_read_rate),
            ("topic_size", self.__get_topic_size),
            ("replicas", self.__get_replicas),
            ("connector_tasks", self.__get_connector_tasks),
        )
        for data_key, collect in collectors:
            self._data[data_key] = collect()

    def __latest_by_label(self, metric: PrometheusMetric, label: str, convert) -> Dict:
        """Map each sample's ``label`` value to ``convert(sample["value"][-1])``."""
        return {
            sample["metric"][label]: convert(sample["value"][-1])
            for sample in self.get_metric(metric=metric)
        }

    def __get_messages_in(self) -> Dict[str, float]:
        """Per-topic MESSAGES_IN values, rounded to two decimals."""
        return self.__latest_by_label(
            PrometheusMetric.MESSAGES_IN, "topic", lambda raw: round(float(raw), 2)
        )

    def __get_messages_out(self) -> Dict[str, float]:
        """Per-topic MESSAGES_OUT values, rounded to two decimals."""
        return self.__latest_by_label(
            PrometheusMetric.MESSAGES_OUT, "topic", lambda raw: round(float(raw), 2)
        )

    def __get_consumer_lag(self) -> Dict[str, int]:
        """Per-group CONSUMER_LAG values as integers."""
        return self.__latest_by_label(PrometheusMetric.CONSUMER_LAG, "group", int)

    def __get_consumer_read_rate(self) -> Dict[str, float]:
        """Per-group CONSUMER_READ_RATE values as floats."""
        return self.__latest_by_label(
            PrometheusMetric.CONSUMER_READ_RATE, "group", float
        )

    def __get_topic_size(self) -> Dict[str, int]:
        """Per-topic TOPIC_SIZE values as integers."""
        return self.__latest_by_label(PrometheusMetric.TOPIC_SIZE, "topic", int)

    def __get_replicas(self) -> Dict[str, int]:
        """Per-deployment REPLICAS values as integers."""
        return self.__latest_by_label(PrometheusMetric.REPLICAS, "deployment", int)

    def __get_connector_tasks(self) -> Dict[str, int]:
        """Per-connector CONNECTOR_TASKS values as integers."""
        return self.__latest_by_label(
            PrometheusMetric.CONNECTOR_TASKS, "connector", int
        )
class get_prometheus_data:
    """Collect Prometheus range-query results for one benchmark sample.

    The queries to run are read from a JSON "include file" that sits in the
    ``prometheus_labels/`` directory next to this module. Every returned data
    point is flattened into its own document and merged with the sample
    information passed to the constructor.
    """

    def __init__(self, action):
        """Store the sample information and, if configured, build the client.

        Args:
            action: mapping with at least ``uuid``, ``user``, ``cluster_name``,
                ``test_config``, ``starttime`` and ``endtime`` (the last two
                as epoch seconds, int or numeric string). ``get_all_metrics``
                additionally reads ``tool`` from it.

        Environment:
            prom_step: query step in seconds; defaults to 30 (the original
                comment calls this the OpenShift default scraping interval).
            prom_token, prom_url: both must be set for data to be pulled;
                otherwise a warning is logged and ``get_all_metrics`` yields
                nothing.
        """
        self.sample_info_dict = action
        self.uuid = action["uuid"]
        self.user = action["user"]
        self.cluster_name = action["cluster_name"]
        self.test_config = action["test_config"]

        # Start/end arrive as epoch seconds; convert them to datetime objects.
        self.start = datetime.fromtimestamp(int(action["starttime"]))
        self.end = datetime.fromtimestamp(int(action["endtime"]))

        # Step used in the range query; the env value (a string) wins over
        # the default.
        self.T_Delta = os.environ.get("prom_step", 30)

        self.get_data = "prom_token" in os.environ and "prom_url" in os.environ
        if self.get_data:
            self.url = os.environ["prom_url"]
            self.headers = {"Authorization": "Bearer " + os.environ["prom_token"]}
            self.pc = PrometheusConnect(
                url=self.url, headers=self.headers, disable_ssl=True
            )
        else:
            # Logger.warn is a deprecated alias (removed in Python 3.13).
            logger.warning(
                "snafu service account token and prometheus url not set \n"
                " No Prometheus data will be indexed"
            )

    @staticmethod
    def _metric_value(raw_value):
        """Convert a sample value string to a float, mapping NaN/±Inf to 0.

        Non-finite floats cannot be represented in standard JSON, so they are
        replaced with 0 before the document is handed on for indexing.
        """
        # Local import: the file's top-level imports are not visible here.
        import math

        metric_value = float(raw_value)
        return metric_value if math.isfinite(metric_value) else 0

    def get_all_metrics(self):
        """Yield one flat document per data point of every configured query.

        Yields nothing when the Prometheus token/URL were not configured.
        A query that fails is logged and skipped; the remaining queries still
        run.

        Yields:
            dict with ``metric`` (the sample's labels, ``__name__`` renamed to
            ``name``), ``Date`` (UTC timestamp string), ``value`` and
            ``metric_name``, updated with every key of the sample information.
        """
        if not self.get_data:
            return

        start_time = time.time()

        # Prefer the tool-specific include file; fall back to the default.
        dirname = os.path.dirname(os.path.realpath(__file__))
        include_file_dir = os.path.join(dirname, "prometheus_labels/")
        tool_include_file = (
            include_file_dir + self.sample_info_dict["tool"] + "_included_labels.json"
        )
        if os.path.isfile(tool_include_file):
            filename = tool_include_file
        else:
            filename = os.path.join(include_file_dir, "included_labels.json")
        logger.info("using prometheus metric include file %s", filename)

        with open(filename, "r") as f:
            datastore = json.load(f)

        step = str(self.T_Delta) + "s"
        for metric_name, query_item in datastore["data"].items():
            query = query_item["query"]
            label = query_item["label"]

            try:
                # Pull the desired series between the sample's start and end.
                response = self.pc.custom_query_range(
                    query, self.start, self.end, step, None
                )
            except Exception as e:
                # Deliberate best effort: one bad query must not abort the
                # rest, so log it and continue with an empty result.
                logger.info(query)
                logger.warning("failure to get metric results %s", e)
                response = []

            for result in response:
                # Rename __name__ to name; fall back to the configured label
                # when the series carries no __name__.
                labels = result["metric"]
                labels["name"] = labels.pop("__name__", label)

                # Each result holds a list of points; flatten it so every
                # point becomes its own document.
                for value in result["values"]:
                    # First element is the timestamp, second the value.
                    timestamp = datetime.utcfromtimestamp(value[0]).strftime(
                        "%Y-%m-%dT%H:%M:%S.%fZ"
                    )
                    flat_doc = {
                        "metric": labels,
                        "Date": timestamp,
                        "value": self._metric_value(value[1]),
                        "metric_name": metric_name,
                    }
                    flat_doc.update(self.sample_info_dict)
                    yield flat_doc

        logger.debug("Total Time --- %s seconds ---", time.time() - start_time)