def get_gpu_number(self):
    """Return the largest number of free GPUs reported for any single DCGM exporter pod.

    For every pod labelled ``app=nvidia-dcgm-exporter`` a Prometheus query
    counts the distinct ``(UUID, GPU_I_ID)`` series of
    ``DCGM_FI_PROF_GR_ENGINE_ACTIVE`` exposed by that pod's ``:9400``
    endpoint and subtracts the ones that carry a non-empty ``exported_pod``
    label. The maximum of these differences over all exporter pods is
    returned.

    Returns:
        int: the maximum computed value, or 0 when the exporter pods cannot
        be listed or none are deployed.
    """
    # Verify if dcgm-exporter is deployed.
    try:
        pod_list = self.api_client.list_pod_for_all_namespaces(
            label_selector="app=nvidia-dcgm-exporter")
    except ApiException as e:
        if e.status != 404:
            _LOGGER.error("Exception when calling DCGM exporter pods: %s\n", e)
        # Without a pod list there is nothing to query. Previously execution
        # fell through and crashed on ``[].items``.
        return 0

    if not pod_list.items:
        return 0

    prom = PrometheusConnect(
        url=self.get_prometheus_url(),
        headers={"Authorization": "Bearer " + self.get_openshift_prometheus_token()},
        disable_ssl=True)

    max_available_gpu = 0
    for pod in pod_list.items:
        pod_IP = pod.status.pod_ip
        # All GPU instances seen on this exporter minus those bound to a pod.
        gpu_query = (
            'count (count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'
            + pod_IP
            + ':9400"}) or vector(0)) - count(count by (UUID,GPU_I_ID) '
            '(DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'
            + pod_IP
            + ':9400", exported_pod=~".+"}) or vector(0))'
        )
        query_result = prom.custom_query(query=gpu_query)
        # Instant-vector samples are [timestamp, "value"] pairs.
        available_on_node = int(query_result[0]['value'][1])
        max_available_gpu = max(max_available_gpu, available_on_node)
    return max_available_gpu
def collect_metrics():
    """Run every SLI query against Prometheus/Thanos and gather the results.

    Returns:
        dict: ``{sli_name: {query_name: value}}`` where ``value`` is the
        float taken from the first sample of the query result, or ``None``
        when the query or its conversion raised.
    """
    thanos = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True,
    )

    report = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        sli_values = {}
        report[sli_name] = sli_values

        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = thanos.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                first_sample = metric_data[0]
                sli_values[query_name] = float(first_sample["value"][1])
            except Exception as e:
                # Best effort: one failing query must not abort the report.
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                sli_values[query_name] = None

    return report
def __init__(self, action):
    """Capture the sample description and, if configured, a Prometheus client.

    Args:
        action: mapping that must provide ``uuid``, ``user``,
            ``cluster_name``, ``test_config``, ``starttime`` and ``endtime``.
            The two time fields are converted with ``int()`` and
            ``datetime.fromtimestamp``.

    Environment:
        prom_step: optional query step. It is stored unchanged, i.e. as a
            string, whereas the built-in default is the integer 30.
        prom_token, prom_url: when both are present a ``PrometheusConnect``
            client is created and ``self.get_data`` is set to True.
            Otherwise ``self.url``, ``self.headers`` and ``self.pc`` are
            not set.
    """
    self.sample_info_dict = action
    self.uuid = action["uuid"]
    self.user = action["user"]
    self.cluster_name = action["cluster_name"]
    self.test_config = action["test_config"]

    # Convert the seconds-as-string timestamps to datetime objects.
    self.start = datetime.fromtimestamp(int(self.sample_info_dict["starttime"]))
    self.end = datetime.fromtimestamp(int(self.sample_info_dict["endtime"]))

    # Step value to be used in Prometheus queries. The default is 30 seconds
    # (OpenShift default scraping interval) but can be overridden with env.
    if "prom_step" in os.environ:
        self.T_Delta = os.environ["prom_step"]
    else:
        self.T_Delta = 30

    self.get_data = False
    if "prom_token" in os.environ and "prom_url" in os.environ:
        self.get_data = True
        token = os.environ["prom_token"]
        self.url = os.environ["prom_url"]
        bearer = "Bearer " + token
        self.headers = {'Authorization': bearer}
        self.pc = PrometheusConnect(url=self.url, headers=self.headers, disable_ssl=True)
    else:
        # Logger.warn is a deprecated alias; use Logger.warning.
        logger.warning("""snafu service account token and prometheus url not set \n No Prometheus data will be indexed""")
class Configuration: """Configuration of metrics-exporter.""" # Prometheus URL = os.environ["PROMETHEUS_HOST_URL"] PROMETHEUS_SERVICE_ACCOUNT_TOKEN = os.environ[ "PROMETHEUS_SERVICE_ACCOUNT_TOKEN"] HEADERS = {"Authorization": f"bearer {PROMETHEUS_SERVICE_ACCOUNT_TOKEN}"} PROM = PrometheusConnect(url=URL, disable_ssl=True, headers=HEADERS) # Namespaces THOTH_BACKEND_NAMESPACE = os.environ["THOTH_BACKEND_NAMESPACE"] THOTH_MIDDLETIER_NAMESPACE = os.environ["THOTH_MIDDLETIER_NAMESPACE"] THOTH_AMUN_INSPECTION_NAMESPACE = os.environ[ "THOTH_AMUN_INSPECTION_NAMESPACE"] # Ceph CEPH_ACCESS_KEY_ID = os.environ["THOTH_CEPH_KEY_ID"] CEPH_ACCESS_SECRET_KEY = os.environ["THOTH_CEPH_SECRET_KEY"] CEPH_BUCKET_PREFIX = os.environ["THOTH_CEPH_BUCKET_PREFIX"] S3_ENDPOINT_URL = os.environ["THOTH_S3_ENDPOINT_URL"] CEPH_BUCKET = os.environ["THOTH_CEPH_BUCKET"] DEPLOYMENT_NAME = os.environ["THOTH_DEPLOYMENT_NAME"] # Kebechet GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
def train_individual_model(predictor_model, initial_run):
    """Fetch recent data for one metric from Prometheus and retrain its model.

    Args:
        predictor_model: object exposing ``metric`` (with ``metric_name`` and
            ``label_config``) and a ``train(data, interval)`` method.
        initial_run: when truthy the whole rolling training window is
            downloaded; otherwise only the last metric chunk.

    Returns:
        The same ``predictor_model`` after ``train`` has been called.
    """
    target = predictor_model.metric
    client = PrometheusConnect(
        url=Configuration.prometheus_url,
        headers=Configuration.prom_connect_headers,
        disable_ssl=True,
    )

    # How far back to look: one chunk normally, the full window on first run.
    lookback = Configuration.metric_chunk_size
    if initial_run:
        lookback = Configuration.rolling_training_window_size
    window_start = datetime.now() - lookback

    # Download new metric data from Prometheus; only the first series is used.
    series = client.get_metric_range_data(
        metric_name=target.metric_name,
        label_config=target.label_config,
        start_time=window_start,
        end_time=datetime.now(),
    )
    new_metric_data = series[0]

    # Train the new model and report how long it took.
    training_started = datetime.now()
    predictor_model.train(
        new_metric_data, Configuration.retraining_interval_minutes)
    elapsed = datetime.now() - training_started
    _LOGGER.info(
        "Total Training time taken = %s, for metric: %s %s",
        str(elapsed),
        target.metric_name,
        target.label_config,
    )
    return predictor_model
def main():
    """Gather per-deployment CPU usage and hand it to ``metric`` with the stdin spec.

    Steps:
      1. Read the list of tracked deployments from the ``deployments``
         collection in MongoDB (the ``list`` field of the last document
         returned wins).
      2. Query Prometheus for CPU usage per workload in the ``default``
         namespace and convert it to millicores.
      3. Parse the JSON spec from stdin and call
         ``metric(spec, list_of_deployments, deployments_cpu)``.

    Any exception is written to stderr and the process exits with status 1.
    """
    # Function-scope import so the credential escaping below is self-contained.
    from urllib.parse import quote_plus

    try:
        # Setting up Mongo DB
        mongo_host = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        mongo_port = str(os.environ.get('MONGO_PORT', '27017'))
        mongo_db = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        mongo_user = str(os.environ.get('MONGO_USERNAME', 'root'))
        # NOTE(review): a fallback password is hard-coded here; kept for
        # backward compatibility, but it should come from a secret store.
        mongo_pass = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))

        # Credentials must be percent-escaped or characters such as '@', ':'
        # or '/' in them produce an invalid connection URI.
        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                quote_plus(mongo_user), quote_plus(mongo_pass),
                mongo_host, mongo_port))
        cpa_db = mongodb_client[mongo_db]
        deployments_collection = cpa_db.deployments

        list_of_deployments = []
        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # get workload cpu
        query_workload_cpu = """
        sum(
          irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
          * on(namespace,pod) group_left(workload, workload_type)
          namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
        ) by (workload, workload_type)
        """

        def get_deployments_cpu_usage(list_of_deployments):
            """Map each tracked deployment to its CPU usage in millicores, or None."""
            wl_cpu_res = prom.custom_query(query=query_workload_cpu)
            # filter results (unit is millicores)
            filtered_cpu_query = {
                q['metric']['workload']: float(q['value'][1]) * 1000
                for q in wl_cpu_res
                if q['metric']['workload'] in list_of_deployments
            }
            # if metric skipped, put in None instead
            for d in list_of_deployments:
                if d not in filtered_cpu_query:
                    filtered_cpu_query[d] = None
            return filtered_cpu_query

        deployments_cpu = get_deployments_cpu_usage(list_of_deployments)

        # Parse spec into a dict
        spec = json.loads(sys.stdin.read())
        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}")
        # sys.exit is always available; the bare exit() builtin is injected by
        # the site module and may be missing.
        sys.exit(1)
def client(self):
    """Build a ``PrometheusConnect`` from ``self._conn_params``.

    TLS verification stays on unless the environment variable
    ``APIALCHEMY_PROMETHEUS_SSL_VERIFY`` is set to something other than
    ``true`` (case-insensitive). In that case urllib3's insecure-request
    warnings are silenced as well.
    """
    raw_flag = os.getenv('APIALCHEMY_PROMETHEUS_SSL_VERIFY', 'true')
    skip_verification = raw_flag.lower() != 'true'
    if skip_verification:
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    return PrometheusConnect(**self._conn_params, disable_ssl=skip_verification)
def test_retry_on_error(self):  # noqa D102
    # A malformed query is expected to yield HTTP 400. With 400 listed in
    # status_forcelist the client retries and finally raises RetryError.
    retry_policy = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
    client = PrometheusConnect(
        url=self.prometheus_host,
        disable_ssl=True,
        retry=retry_policy,
    )

    expected_error = requests.exceptions.RetryError
    with self.assertRaises(expected_error, msg="too many 400 error responses"):
        client.custom_query("BOOM.BOOM!#$%")
def __init__(self, host, port, disablessl):
    """Create a Prometheus client for ``<schema>://<host>:<port>``.

    Args:
        host: host name or address of the Prometheus server.
        port: port number; may be a string or an integer.
        disablessl: when truthy the ``http`` schema is used, otherwise
            ``https``. The same value is forwarded to ``PrometheusConnect``
            as ``disable_ssl``.

    If the client cannot be constructed, "Fehler" (German for "error") is
    printed and ``self.prom`` is set to ``None`` so the attribute always
    exists.
    """
    # Truthiness is used rather than ``== True`` so the schema choice and the
    # ``disable_ssl`` flag below agree for any truthy value.
    self.schema = "http" if disablessl else "https"
    try:
        # str.format accepts an integer port; plain concatenation raised
        # TypeError, which was then silently swallowed below.
        self.prom = PrometheusConnect(
            url="{}://{}:{}".format(self.schema, host, port),
            disable_ssl=disablessl)
    except Exception:
        print("Fehler")
        self.prom = None
def pro():
    """Print the first current ``haproxy_backend_up`` sample for namespace 'prophet'."""
    # NOTE(review): a bearer token is embedded in source here; consider
    # loading it from the environment and rotating this one.
    auth_headers = {
        "Authorization": "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
    }
    client = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers=auth_headers,
        disable_ssl=True,
    )

    current_values = client.get_current_metric_value(
        metric_name="haproxy_backend_up{exported_namespace='prophet'}")
    up_metric = MetricsList(current_values)
    print(up_metric[0])
def get_current(self) -> float:
    """Run ``self.query`` against Prometheus and return its latest value.

    The client is built from ``self.config`` (``url`` defaults to
    ``http://localhost:9090``, ``disable_ssl`` defaults to True).

    Returns:
        The last element of the first result's ``value`` entry as a float.

    Raises:
        Exception: when the query returns no result.
    """
    connection_kwargs = {
        "url": self.config.get("url", "http://localhost:9090"),
        "disable_ssl": self.config.get("disable_ssl", True),
    }
    client = PrometheusConnect(**connection_kwargs)

    samples = client.custom_query(query=self.query)
    if samples:
        log.info(f"Prometheus query result: {samples}")
        first_sample = samples[0]
        return float(first_sample.get("value")[-1])

    log.error("Prometheus query: no result")
    raise Exception("Prometheus query: no result")
def timed_job():
    """Dump the last hour of every Prometheus metric into a blob.

    Connection details are read from the ``DEFAULT`` section of
    ``config/config.cfg``: ``ACCOUNT``/``KEY`` for ``BlockBlobService``,
    ``PROM`` for the Prometheus credentials (base64-encoded into a Basic
    auth header), ``CONTAINER`` for the target container and ``URL`` for
    the Prometheus endpoint.

    The collected label sets and samples are merged into one DataFrame on a
    synthetic ``id`` column and written with
    ``write_pandas_dataframe_to_blob`` under ``<date>/<time>``, with ':' and
    '.' removed from the time.

    NOTE(review): the original indentation of this function was not
    available; the nesting below is the most plausible reading and should
    be confirmed against the upstream file.
    """
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    key = config.get('DEFAULT', 'KEY')
    promi = config.get('DEFAULT', 'PROM')
    promup = promi.encode()
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')
    blob_service = BlockBlobService(account_name=account, account_key=key)
    # HTTP Basic auth: base64 of the PROM config value.
    # presumably "user:password" - verify against the config file
    userAndPass = b64encode(promup).decode("ascii")
    headers = {'Authorization': 'Basic %s' % userAndPass}
    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)
    metric_data = prom.all_metrics()
    # NOTE(review): this local name shadows any module-level `time` in scope.
    time = datetime.now()
    metrics = []  # one dict of labels (plus 'id') per series
    values = []  # one {'time', 'value', 'id'} dict per sample
    for i in metric_data:
        metric = prom.get_metric_range_data(metric_name=i,
                                            start_time=time - timedelta(hours=1),
                                            end_time=time,
                                            chunk_size=timedelta(hours=1))
        # NOTE(review): with this nesting the id counter restarts for every
        # metric name, so ids repeat across metrics and the merge below may
        # join unrelated series - confirm intended placement.
        x = int(0)
        for d in metric:
            for name, dct in d.items():
                dct = dict(dct)
                if name == 'metric':
                    dct['id'] = x
                    metrics.append(dct)
                else:
                    # Any other entry is treated as samples; after dict() the
                    # keys are used as time and the mapped items as value.
                    # NOTE(review): `key` reuses the name of the account key
                    # read above (already consumed at this point).
                    for key in dct:
                        va = {}
                        va['time'] = key
                        va['value'] = dct[key]
                        va['id'] = x
                        values.append(va)
            x = x + 1
    df = pd.DataFrame(metrics)
    df1 = pd.DataFrame(values)
    df = pd.merge(df, df1, how='inner', left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')
    # Raises KeyError if any of these columns is absent from the merged frame.
    df = df.drop(['endpoint', 'service', 'id'], axis=1)
    write_pandas_dataframe_to_blob(
        blob_service, df, container,
        str((datetime.now()).date()) + '/' +
        str(datetime.now().time()).replace(':', '').replace(".", ''))
def launch_prometheus():
    """Port-forward the Istio Prometheus pod to localhost:9090 and connect to it.

    Exits the process with ``util.EXIT_FAILURE`` when the Kubernetes status
    check does not report success.

    Returns:
        tuple: ``(prom_proc, prom_api)`` - the port-forward process handle
        and a ``PrometheusConnect`` bound to ``http://localhost:9090``.
    """
    kubernetes_ready = kube_env.check_kubernetes_status() == util.EXIT_SUCCESS
    if not kubernetes_ready:
        log.error("Kubernetes is not set up."
                  " Did you run the deployment script?")
        sys.exit(util.EXIT_FAILURE)

    # Look up the name of the first pod labelled app=prometheus.
    find_pod_cmd = ("kubectl get pods -n istio-system -lapp=prometheus "
                    " -o jsonpath={.items[0].metadata.name}")
    prom_pod_name = util.get_output_from_proc(find_pod_cmd).decode("utf-8")

    forward_cmd = f"kubectl port-forward -n istio-system {prom_pod_name} 9090"
    prom_proc = util.start_process(forward_cmd, preexec_fn=os.setsid)
    # presumably gives the port-forward time to come up - TODO confirm
    time.sleep(2)

    prom_api = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    return prom_proc, prom_api
def __init__(self, docker_client_services_path, docker_server_services_path,
             ingress_distribution_file_path, docker_lb_container_path,
             service_list):
    """Load the service description files and open the Prometheus connection.

    Each ``*_path`` argument is passed to ``get_docker_services`` and the
    result is stored on the instance. ``service_list`` is stored unchanged.
    """
    load = get_docker_services
    self.docker_client_services = load(docker_client_services_path)
    self.docker_server_services = load(docker_server_services_path)
    self.get_ingress_distribution = load(ingress_distribution_file_path)
    self.docker_lb_services = load(docker_lb_container_path)

    # Fixed Prometheus endpoint used by this collector.
    self.prom = PrometheusConnect(url="http://131.155.35.54:9090",
                                  disable_ssl=True)

    self.capture_time = CAPTURE_TIME
    self.service_list = service_list
def check_database_metrics_availability(configuration: Configuration) -> bool:
    """Report whether the Prometheus/Thanos endpoint answers with an OK status.

    A GET request is sent to the root of ``configuration.thanos_url`` using
    the client's own session, TLS setting and bearer-token headers.

    Returns:
        True when the response is OK, False otherwise.
    """
    auth_headers = {"Authorization": f"bearer {configuration.thanos_token}"}
    client = PrometheusConnect(
        url=configuration.thanos_url,
        headers=auth_headers,
        disable_ssl=True,
    )

    root_url = "{0}/".format(client.url)
    response = client._session.get(
        root_url,
        verify=client.ssl_verification,
        headers=client.headers,
        params={},
    )
    return bool(response.ok)
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """Build a per-GPU usage summary for the exporter at ``pod_ip``:9400.

    For every series of ``metrics`` that carries a ``gpu`` label, the result
    of ``collect_cur_usage`` is extended with ``cyclic_pattern``, ``period``
    (only when a cycle is detected) and ``max_mem_util``. It is then stored
    as a string under the key ``DOMAIN + "/gpu-" + <gpu label>``.

    The values are stringified dictionaries on purpose: nested dictionaries
    lead to the error "cannot unmarshal string into Go value of type
    map[string]interface {}".

    Returns:
        dict: key -> ``str(usage_dict)``; empty when the Prometheus
        connection check fails.
    """
    client = PrometheusConnect(url=url, disable_ssl=True)
    try:
        client.check_prometheus_connection()
    except Exception as e:
        # Connection failed: log and report nothing.
        logging.error(e)
        return dict()

    # Select the metrics of the current host only (port is fixed for now).
    label_filter = {"instance": pod_ip + ":9400"}
    window_start = parse_datetime(ana_window)
    window_end = parse_datetime("now")
    raw_series = client.get_metric_range_data(metric_name=metrics,
                                              label_config=label_filter,
                                              start_time=window_start,
                                              end_time=window_end)

    usage_by_gpu = dict()
    # MetricsList regroups the raw data into label_config / metric_values.
    for series in MetricsList(raw_series):
        labels = series.label_config
        if 'gpu' not in labels:
            # Series without a gpu label carry no per-GPU information.
            continue
        gpu_index = labels['gpu']
        key = DOMAIN + "/gpu-" + gpu_index
        cur_usage = collect_cur_usage(int(gpu_index))

        # Second column of metric_values holds the sampled values.
        samples = series.metric_values.iloc[:, 1]
        cur_usage['cyclic_pattern'] = False
        if samples.max() > 0:
            cyclic, period = cyclic_pattern_detection(samples)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(samples.max())

        usage_by_gpu[key] = str(cur_usage)
    return usage_by_gpu
def update_saved_prom_metrics(metrics, save_dir):
    """Fetch the current cluster metrics and save them as ``metrics.parquet``.

    Args:
        metrics: container of metric names to fetch. The merge below uses
            all three frames unconditionally, so it must contain
            ``cluster_operator_conditions``, ``cluster_installer`` and
            ``cluster_version``; a missing name raises ``NameError`` when
            the frames are combined.
        save_dir: directory in which ``metrics.parquet`` is written.

    Environment:
        FLT_PROM_URL: Prometheus endpoint (defaults to the telemeter URL).
        FLT_PROM_ACCESS_TOKEN: bearer token for that endpoint.
    """
    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL",
                         "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    # get metrics if available
    if "cluster_operator_conditions" in metrics:
        conditions_df = metric_preprocessors.opconds_metrics_to_df(
            metrics_raw=pc.get_current_metric_value(
                "cluster_operator_conditions"))
    if "cluster_installer" in metrics:
        install_df = metric_preprocessors.installer_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_installer"))
    if "cluster_version" in metrics:
        versions_df = metric_preprocessors.version_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_version"))

    # combine all metrics
    metrics_df = conditions_df.merge(install_df,
                                     how="left",
                                     left_index=True,
                                     right_index=True)
    metrics_df = metrics_df.merge(versions_df,
                                  how="left",
                                  left_index=True,
                                  right_index=True)

    # nans because some install types are neither upi nor ipi (unknown)
    metrics_df["install_type_IPI"] = metrics_df["install_type_IPI"].fillna(0)
    metrics_df["install_type_UPI"] = metrics_df["install_type_UPI"].fillna(0)

    # save to volume
    # The destination is passed positionally: the keyword was `fname` before
    # pandas 1.0 and is `path` since, so positional works on both.
    metrics_df.to_parquet(
        os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
def queryMetrics(customquery, trim):
    """Run an instant query on the local Prometheus and return its top rows.

    Args:
        customquery: PromQL expression to evaluate.
        trim: number of rows to keep after sorting by value, descending.

    Returns:
        DataFrame with one row per series, ``value`` converted to numeric
        and ``timestamp`` converted to datetime. Rows whose value is the
        string "NaN" are dropped.
    """
    client = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    raw_result = client.custom_query(query=customquery)

    # One row per metric series.
    table = MetricSnapshotDataFrame(raw_result)
    table = table[table.value != "NaN"]
    table[['value']] = table[['value']].apply(pd.to_numeric)
    table[['timestamp']] = table[['timestamp']].apply(pd.to_datetime, unit='s')

    ranked = table.sort_values('value', ascending=False)
    return ranked.head(trim)
def build_reports(timestamp, config, es_url, thanos_url, grafana_url,
                  target_index):
    """Assemble podLatency reports per cluster benchmark and index them.

    Clusters and documents are read from Elasticsearch. For each benchmark
    of each cluster a report is built, and reports with non-empty results
    are enriched via ``enrich.enrich_reports`` and indexed into
    ``target_index``. Each index response is printed.
    """
    es_client = Elasticsearch(es_url)
    thanos_client = PrometheusConnect(thanos_url, disable_ssl=True)

    clusters, docs = collect.get_clusters(es_client,
                                          timestamp,
                                          indices=config['searchIndices'])

    reports = []
    for cluster in clusters:
        cluster_benchmarks = collect.get_benchmarks_for_cluster(
            cluster['cluster_name'], docs, config['ignoreTags'])
        for benchmark in cluster_benchmarks:
            report = {
                **cluster,
                'report_type': 'podLatency',
                'metadata': benchmark['metadata'],
                "results": collect.get_benchmark_results(benchmark, es_client)
            }
            if report['results'] == {}:
                # Nothing measured for this benchmark; skip it.
                continue
            print(f"cluster {report['cluster_name']} has results")
            reports.append(report)

    enriched_reports = enrich.enrich_reports(reports, grafana_url,
                                             thanos_client, config)
    for report in enriched_reports:
        response = index.index_report(es_client, report, target_index)
        print(response)
import tornado.ioloop
import tornado.web
from tornado.httpserver import HTTPServer
from prometheus_client import Gauge, generate_latest, REGISTRY
from prometheus_api_client import PrometheusConnect, Metric
from configuration import Configuration
from graph_handler import GraphHandler
import schedule

_LOGGER = logging.getLogger(__name__)

# One predictor model per unique time series; filled by the loop below when
# this module is executed.
PREDICTOR_MODEL_LIST = list()

# Client built from the shared Configuration (disable_ssl=True).
pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prometheus_headers,
    disable_ssl=True,
)

# For every configured metric name, fetch its current value and create one
# model instance for each entry returned.
for metric in Configuration.metrics_list:
    metric_init = pc.get_current_metric_value(metric_name=metric)

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            Configuration.algorithm(
                unique_metric,
                rolling_data_window_size=Configuration.rolling_training_window_size,
            ))

# Starts empty.
# presumably filled with prometheus_client Gauge objects later - verify
GAUGE_DICT = dict()
def setUp(self):
    """Create the Prometheus client used by the tests.

    The endpoint is read from the ``PROM_URL`` environment variable and is
    also kept on ``self.prometheus_host``.
    """
    host = os.getenv("PROM_URL")
    self.prometheus_host = host
    self.pc = PrometheusConnect(url=host, disable_ssl=True)
def setUp(self):
    # The tests only need a client object; the placeholder URL is stored as
    # given.
    placeholder_client = PrometheusConnect(url='http://doesnt_matter.xyz',
                                           disable_ssl=True)
    self.pc = placeholder_client
def __init__(self, nodes: NodeDataView):
    """Initialise the base class with ``nodes`` and connect to Prometheus.

    The Prometheus endpoint is taken from ``settings.prometheus.url``.
    """
    super().__init__(nodes)
    prometheus_url = settings.prometheus.url
    self._prom = PrometheusConnect(url=prometheus_url)
def process_period(config, period):
    """Query Prometheus for one accounting period and queue the resulting records.

    Args:
        config: object providing ``prometheus_server``, ``query_timeout`` and
            ``output_path``; it is also passed on to ``summary_message`` and
            ``sync_message``.
        period: mapping with ``year``, ``month``, ``instant`` (the moment the
            instant queries are evaluated at) and ``range_sec`` (how many
            seconds to look back from ``instant``).

    Returns:
        None. When there is at least one valid job, a summary record and a
        sync record are added to the ``QueueSimple`` directory queue at
        ``config.output_path``.

    Raises:
        AssertionError: when one of the consistency checks below fails.
            NOTE(review): ``assert`` statements are stripped under
            ``python -O``, so these checks vanish in optimized runs.
    """
    period_start = period['instant'] + dateutil.relativedelta.relativedelta(
        seconds=-period['range_sec'])
    print(
        f"Processing year {period['year']}, month {period['month']}, "
        f"querying from {period['instant'].isoformat()} and going back {period['range_sec']} s to {period_start.isoformat()}."
    )
    queries = QueryLogic(queryRange=(str(period['range_sec']) + 's'))

    # SSL generally not used for Prometheus access within a cluster
    # Docs on instant query API: https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
    prom = PrometheusConnect(url=config.prometheus_server, disable_ssl=True)
    # Extra parameters sent with every instant query.
    prom_connect_params = {
        'time': period['instant'].isoformat(),
        'timeout': config.query_timeout
    }

    raw_results, results, result_lengths = {}, {}, []
    # iterate over each query (cputime, starttime, endtime, cores) producing raw_results['cputime'] etc.
    for query_name, query_string in vars(queries).items():
        # Each of these raw_results is a list of dicts. Each dict in the list represents an individual data point, and contains:
        # 'metric': a dict of one or more key-value pairs of labels, one of which is the pod name ('exported_pod').
        # 'value': a list in which element 0 is the timestamp of the value, and element 1 is the actual value we're interested in.
        print(f'Executing {query_name} query: {query_string}')
        t1 = timer()
        raw_results[query_name] = prom.custom_query(query=query_string,
                                                    params=prom_connect_params)
        t2 = timer()
        # rearrange() yields pairs that dict() turns into a mapping.
        # presumably job key -> value - verify against rearrange()
        results[query_name] = dict(rearrange(raw_results[query_name]))
        result_lengths.append(len(results[query_name]))
        t3 = timer()
        print(
            f'Query finished in {t2 - t1} s, processed in {t3 - t2} s. Got {len(results[query_name])} items from {len(raw_results[query_name])} results. Peak RAM usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss}K.'
        )
        # Drop the raw result once it has been rearranged.
        del raw_results[query_name]

    cputime = results['cputime']
    endtime = results['endtime']
    starttime = results['starttime']
    cores = results['cores']

    # Confirm the assumption that cputime should have the fewest entries, while starttime and cores may have additional ones
    # corresponding to jobs that have started but not finished yet, and endtime may have additional ones if there are pods without CPU resource requests.
    # We only want the jobs for which all values are available: start time, end time, CPU request.
    # Note that jobs which started last month and finished this month will be properly included and accounted in this month.
    assert len(cputime) == min(
        result_lengths), "cputime should be the shortest list"

    # However, jobs that finished last month may show up in this month's data if they are still present on the cluster this month (in Completed state).
    # Exclude them by filtering with a lambda (since you can't pass an argument to a function object AFAIK).
    endtime = dict(
        filter(lambda x: x[1] >= datetime.datetime.timestamp(period_start),
               endtime.items()))

    # Prepare to iterate over jobs which meet all criteria.
    valid_jobs = cputime.keys() & endtime.keys()
    # avoid sending empty records
    if len(valid_jobs) == 0:
        print('No records to process.')
        return

    sum_cputime = 0
    t4 = timer()
    for key in valid_jobs:
        assert endtime[key] > starttime[
            key], "job end time is before start time"
        # double check cputime calc of this job
        delta = abs(cputime[key] -
                    (endtime[key] - starttime[key]) * cores[key])
        assert delta < 0.001, "cputime calculation is inaccurate"
        sum_cputime += cputime[key]

    # CPU time as calculated here means (# cores * job duration), which apparently corresponds to
    # the concept of wall time in APEL accounting. It is not clear what CPU time means in APEL;
    # could be the actual CPU usage % integrated over the job (# cores * job duration * usage)
    # but this does not seem to be documented clearly. Some batch systems do not actually measure
    # this so it is not reported consistently or accurately. Some sites have CPU efficiency
    # (presumably defined as CPU time / wall time) time that is up to ~ 500% of the walltime, or
    # always fixed at 100%. In Kubernetes, the actual CPU usage % is tracked by metrics server
    # (not KSM), which is not meant to be used for monitoring or accounting purposes and is not
    # scraped by Prometheus. So just use walltime = cputime
    sum_cputime = round(sum_cputime)
    sum_walltime = sum_cputime

    print(f'total cputime: {sum_cputime}, total walltime: {sum_walltime}')
    # Write output to the message queue on local filesystem
    # https://dirq.readthedocs.io/en/latest/queuesimple.html#directory-structure
    dirq = QueueSimple(str(config.output_path))
    # NOTE(review): n_jobs, first_end and last_end are taken from the filtered
    # `endtime` dict, while the sums above cover only `valid_jobs` - confirm
    # that the two sets are meant to coincide.
    summary_output = summary_message(
        config,
        year=period['year'],
        month=period['month'],
        wall_time=sum_walltime,
        cpu_time=sum_cputime,
        n_jobs=len(endtime),
        # this appears faster than getting min/max during the dict iteration above
        first_end=round(min(endtime.values())),
        last_end=round(max(endtime.values())))
    sync_output = sync_message(config,
                               year=period['year'],
                               month=period['month'],
                               n_jobs=len(endtime))
    t5 = timer()
    summary_file = dirq.add(summary_output)
    sync_file = dirq.add(sync_output)
    print(f'Analyzed {len(endtime)} records in {t5 - t4} s.')
    print(f'Writing summary record to {config.output_path}/{summary_file}:')
    print('--------------------------------\n' + summary_output +
          '--------------------------------')
    print(f'Writing sync record to {config.output_path}/{sync_file}:')
    print('--------------------------------\n' + sync_output +
          '--------------------------------')
from util import get_interval_minutes from config_parser import getSettings from prometheus_api_client import PrometheusConnect from prometheus_api_client.utils import parse_timedelta from datetime import datetime from hashlib import md5 import json headers = None settings = getSettings('prometheus') if 'access_token' in settings: headers = {'Autorization': 'bearer ' + settings['access_token']} pc = PrometheusConnect( url=settings['url'], headers=headers, disable_ssl=True, ) def query_range(expr, time_range, resolution, fill_na): fill_method = { 'zeros': fill_na_zeros, 'default': None }[fill_na or 'default'] delta = parse_timedelta('now', time_range) start_time = datetime.now() - delta end_time = datetime.now() data = pc.custom_query_range(query=expr,
def setUp(self):  # noqa D102
    # Client bound to a placeholder URL for use by the tests.
    placeholder_url = "http://doesnt_matter.xyz"
    self.pc = PrometheusConnect(url=placeholder_url, disable_ssl=True)
def __init__(self, url, disable_ssl=False):
    """Wrap a ``PrometheusConnect`` client for ``url``.

    Args:
        url: Prometheus endpoint.
        disable_ssl: forwarded unchanged to ``PrometheusConnect``.
    """
    connection = PrometheusConnect(url=url, disable_ssl=disable_ssl)
    self.prom = connection
def query_prom_data_range(svc_names,
                          query_fn,
                          start_time,
                          end_time,
                          sampling_rate=1,
                          is_summary=False,
                          url="http://vmhost1.local:9090"):
    """Fetch range data from Prometheus for several services, batch by batch.

    Params:
        svc_names: iterable of service metric names.
        query_fn: callable that builds the Prometheus query string from a
            service name.
        start_time: start of the range, a datetime.datetime object.
        end_time: end of the range, a datetime.datetime object.
        sampling_rate: query step in seconds; also the gap inserted between
            consecutive batches.
        is_summary: pass ``True`` when the query yields one series per
            quantile; series are then keyed ``q<quantile>`` instead of
            ``data``.
        url: Prometheus endpoint.

    Returns:
        dict keyed by service name. Each value is ``None`` when no data was
        found, otherwise a dict holding ``metric`` (labels of the first
        series) plus ``timestamps_<key>`` (int64) and ``values_<key>``
        (float64) arrays concatenated over all batches.
    """
    client = PrometheusConnect(url=url, disable_ssl=True)
    # The range is fetched in 3-hour windows, one window per request.
    window = datetime.timedelta(hours=3)

    collected = {}
    for svc in svc_names:
        query = query_fn(svc)
        ts_chunks = {}
        value_chunks = {}
        metric_info = None

        lo = start_time
        hi = start_time + window
        while lo < end_time:
            if hi >= end_time:
                hi = end_time
            series_list = client.custom_query_range(query=query,
                                                    start_time=lo,
                                                    end_time=hi,
                                                    step=sampling_rate)

            # A window may legitimately contain no data; nothing to record.
            if len(series_list) > 0:
                if metric_info is None:
                    metric_info = {'metric': series_list[0]['metric'].copy()}

                for series in series_list:
                    samples = np.array(series['values'], dtype=np.float64)
                    if is_summary is True:
                        # One series per quantile: drop the quantile from the
                        # shared metric info and key the data by it instead.
                        metric_info['metric'].pop('quantile', None)
                        key = 'q' + series['metric']['quantile']
                    else:
                        # Only one time series.
                        key = 'data'
                    ts_chunks.setdefault(key, []).append(samples[:, 0])
                    value_chunks.setdefault(key, []).append(samples[:, 1])

            # [lo, hi] is inclusive at both ends, so the next window starts
            # one sampling step after the previous end.
            lo = hi + datetime.timedelta(seconds=sampling_rate)
            hi = lo + window

        # Stitch the per-window pieces together: timestamps first, then values.
        merge_plan = (
            ('timestamps', ts_chunks, np.int64),
            ('values', value_chunks, np.float64),
        )
        for prefix, chunks, conv_type in merge_plan:
            for key, pieces in chunks.items():
                merged = np.concatenate(pieces).astype(conv_type)
                metric_info[f'{prefix}_{key}'] = merged

        collected[svc] = metric_info
    return collected
import re import gspread from oauth2client.service_account import ServiceAccountCredentials scope = ["https://spreadsheets.google.com/feeds",'https://www.googleapis.com/auth/spreadsheets',"https://www.googleapis.com/auth/drive.file","https://www.googleapis.com/auth/drive"] creds = ServiceAccountCredentials.from_json_keyfile_name("../client_secret.json", scope) client = gspread.authorize(creds) sheet = client.open_by_key('1ry_tos2ZityB4futWmUTNmXN5q-NnZwIF_BqNv9n8E8').worksheet("telemeter") url = "https://telemeter-lts.datahub.redhat.com" token = "" with open('telemeter_token.txt', 'r') as file: api_token = file.read() pc = PrometheusConnect(url=url, headers={"Authorization": "bearer {}".format(token)}, disable_ssl=False) # gets query information for a specific date - each function call is a customized query def getClusterMetric(buildRow, time): data = None try: data = pc.custom_query(query='count(sum by (_id)(subscription_labels{managed="true"}) * 1)', params={"time":time.strftime('%Y-%m-%dT%H:%M:%SZ')}) for arr in data: buildRow.append(arr.get('value')[1]) except: pass def getCPUCores(buildRow, time): data = None try: data = pc.custom_query(query='sum(sum by (_id)(cluster:capacity_cpu_cores:sum) + \
def collect_metrics(configuration: Configuration, sli_report: SLIReport):
    """Evaluate every SLI query of the report against Prometheus/Thanos.

    A query entry is either a plain query string or a dict with ``query``,
    ``requires_range`` and ``type``. Range queries keep only the strictly
    positive values of the first series and reduce them with
    ``manipulate_retrieved_metrics_vector``. Instant queries take the value
    of the first sample. In dry-run mode no request is made and every
    result is ``0.0``.

    Returns:
        dict: ``{sli_name: {query_name: result}}``; a failed query is
        recorded as the string ``"ErrorMetricRetrieval"``.
    """
    if not _DRY_RUN:
        pc = PrometheusConnect(
            url=configuration.thanos_url,
            headers={"Authorization": f"bearer {configuration.thanos_token}"},
            disable_ssl=True,
        )

    collected_info = {}
    for sli_name, sli_methods in sli_report.report_sli_context.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        sli_results = {}
        collected_info[sli_name] = sli_results

        for query_name, query_inputs in sli_methods["query"].items():
            requires_range = False
            if isinstance(query_inputs, dict):
                query = query_inputs["query"]
                requires_range = query_inputs["requires_range"]
                action_type = query_inputs["type"]
            else:
                query = query_inputs

            _LOGGER.info(f"Querying... {query_name}")
            _LOGGER.info(f"Using query... {query}")

            try:
                if _DRY_RUN:
                    # Synthetic sample so the rest of the pipeline can run.
                    metric_data = [{
                        "metric": "dry run",
                        "value": [datetime.datetime.utcnow(), 0]
                    }]
                    result = float(metric_data[0]["value"][1])
                elif requires_range:
                    metric_data = pc.custom_query_range(
                        query=query,
                        start_time=configuration.start_time,
                        end_time=configuration.end_time,
                        step=configuration.step,
                    )
                    _LOGGER.info(f"Metric obtained... {metric_data}")
                    metrics_vector = [
                        float(v[1]) for v in metric_data[0]["values"]
                        if float(v[1]) > 0
                    ]
                    result = manipulate_retrieved_metrics_vector(
                        metrics_vector=metrics_vector, action=action_type)
                else:
                    metric_data = pc.custom_query(query=query)
                    _LOGGER.info(f"Metric obtained... {metric_data}")
                    result = float(metric_data[0]["value"][1])

                sli_results[query_name] = result
            except Exception as e:
                # Best effort: record the failure and continue with the rest.
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                sli_results[query_name] = "ErrorMetricRetrieval"

    return collected_info