def get_gpu_number(self):
    max_available_gpu = 0
    pod_list = None
    # Verify that dcgm-exporter is deployed
    try:
        pod_list = self.api_client.list_pod_for_all_namespaces(label_selector="app=nvidia-dcgm-exporter")
    except ApiException as e:
        if e.status != 404:
            _LOGGER.error("Exception when listing DCGM exporter pods: %s", e)

    if pod_list is not None and len(pod_list.items) != 0:
        prom = PrometheusConnect(
            url=self.get_prometheus_url(),
            headers={"Authorization": "Bearer " + self.get_openshift_prometheus_token()},
            disable_ssl=True)

        for pod in pod_list.items:
            pod_ip = pod.status.pod_ip
            # Count GPUs exposed by this exporter instance, minus those already
            # attached to a pod (exported_pod label present).
            gpu_query = 'count (count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="' + pod_ip +\
                        ':9400"}) or vector(0)) - count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'\
                        + pod_ip + ':9400", exported_pod=~".+"}) or vector(0))'

            available_gpu_in_node_data = prom.custom_query(query=gpu_query)
            available_gpu_in_node = int(available_gpu_in_node_data[0]['value'][1])

            if available_gpu_in_node > max_available_gpu:
                max_available_gpu = available_gpu_in_node

    return max_available_gpu
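For reference, custom_query on an instant vector returns a list of samples whose 'value' field is a [timestamp, value-string] pair, which is why the snippet indexes [0]['value'][1]. A minimal illustration (the sample data here is made up):

sample = [{'metric': {}, 'value': [1612345678.0, '3']}]
available = int(sample[0]['value'][1])  # -> 3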
Code example #2
File: app.py Project: harshad16/slo-reporter
def collect_metrics():
    """Collect metrics from Prometheus/Thanos."""
    pc = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True)

    collected_info = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}
        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = pc.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                collected_info[sli_name][query_name] = float(
                    metric_data[0]["value"][1])
            except Exception as e:
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                collected_info[sli_name][query_name] = None

    return collected_info
Code example #3
    def __init__(self, action):

        self.sample_info_dict = action
        self.uuid = action["uuid"]
        self.user = action["user"]
        self.cluster_name = action["cluster_name"]
        self.test_config = action["test_config"]

        # change datetime in seconds string to datetime object
        starttime = datetime.fromtimestamp(int(self.sample_info_dict["starttime"]))
        self.start = starttime

        # change datetime in seconds string to datetime object
        endtime = datetime.fromtimestamp(int(self.sample_info_dict["endtime"]))
        self.end = endtime

        # step value to be used in prometheus query
        # default is 30 seconds(openshift default scraping interval)
        # but can be overridden with env
        if "prom_step" in os.environ:
            self.T_Delta = os.environ["prom_step"]
        else:
            self.T_Delta = 30

        self.get_data = False
        if "prom_token" in os.environ and "prom_url" in os.environ:
            self.get_data = True
            token = os.environ["prom_token"]
            self.url = os.environ["prom_url"]
            bearer = "Bearer " + token
            self.headers = {'Authorization': bearer}
            self.pc = PrometheusConnect(url=self.url, headers=self.headers, disable_ssl=True)
        else:
            logger.warning("snafu service account token and prometheus url not set; "
                           "no Prometheus data will be indexed")
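The constructor stores self.pc, self.start, self.end, and self.T_Delta, but the query call itself is not part of the excerpt. A hedged sketch of how a range query might use them (the method name is hypothetical):

    def get_prometheus_data(self, query):
        """Hypothetical helper: run a range query over the sample window."""
        if not self.get_data:
            return []
        return self.pc.custom_query_range(
            query=query,
            start_time=self.start,
            end_time=self.end,
            step=str(self.T_Delta),
        )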
Code example #4
class Configuration:
    """Configuration of metrics-exporter."""

    # Prometheus
    URL = os.environ["PROMETHEUS_HOST_URL"]
    PROMETHEUS_SERVICE_ACCOUNT_TOKEN = os.environ[
        "PROMETHEUS_SERVICE_ACCOUNT_TOKEN"]
    HEADERS = {"Authorization": f"bearer {PROMETHEUS_SERVICE_ACCOUNT_TOKEN}"}
    PROM = PrometheusConnect(url=URL, disable_ssl=True, headers=HEADERS)

    # Namespaces
    THOTH_BACKEND_NAMESPACE = os.environ["THOTH_BACKEND_NAMESPACE"]
    THOTH_MIDDLETIER_NAMESPACE = os.environ["THOTH_MIDDLETIER_NAMESPACE"]
    THOTH_AMUN_INSPECTION_NAMESPACE = os.environ[
        "THOTH_AMUN_INSPECTION_NAMESPACE"]

    # Ceph
    CEPH_ACCESS_KEY_ID = os.environ["THOTH_CEPH_KEY_ID"]
    CEPH_ACCESS_SECRET_KEY = os.environ["THOTH_CEPH_SECRET_KEY"]
    CEPH_BUCKET_PREFIX = os.environ["THOTH_CEPH_BUCKET_PREFIX"]
    S3_ENDPOINT_URL = os.environ["THOTH_S3_ENDPOINT_URL"]
    CEPH_BUCKET = os.environ["THOTH_CEPH_BUCKET"]

    DEPLOYMENT_NAME = os.environ["THOTH_DEPLOYMENT_NAME"]

    # Kebechet
    GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
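A short usage sketch, assuming the environment variables above are set (the module name is an assumption):

from configuration import Configuration  # module name assumed

active_targets = Configuration.PROM.custom_query(query="up")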
Code example #5
def train_individual_model(predictor_model, initial_run):
    metric_to_predict = predictor_model.metric
    pc = PrometheusConnect(
        url=Configuration.prometheus_url,
        headers=Configuration.prom_connect_headers,
        disable_ssl=True,
    )

    data_start_time = datetime.now() - Configuration.metric_chunk_size
    if initial_run:
        data_start_time = (
            datetime.now() - Configuration.rolling_training_window_size
        )

    # Download new metric data from prometheus
    new_metric_data = pc.get_metric_range_data(
        metric_name=metric_to_predict.metric_name,
        label_config=metric_to_predict.label_config,
        start_time=data_start_time,
        end_time=datetime.now(),
    )[0]

    # Train the new model
    start_time = datetime.now()
    predictor_model.train(
            new_metric_data, Configuration.retraining_interval_minutes)

    _LOGGER.info(
        "Total Training time taken = %s, for metric: %s %s",
        str(datetime.now() - start_time),
        metric_to_predict.metric_name,
        metric_to_predict.label_config,
    )
    return predictor_model
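A hedged sketch of how such a trainer might be driven from a scheduler, assuming a PREDICTOR_MODEL_LIST like the one built in Code example #20 below:

for i, model in enumerate(PREDICTOR_MODEL_LIST):
    # retrain each model in place; initial_run=True pulls the full rolling window
    PREDICTOR_MODEL_LIST[i] = train_individual_model(model, initial_run=True)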
Code example #6
def main():
    try:
        # Setting up Mongo DB
        MONGO_HOST = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        MONGO_PORT = str(os.environ.get('MONGO_PORT', '27017'))
        MONGO_DB = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        MONGO_USER = str(os.environ.get('MONGO_USERNAME', 'root'))
        MONGO_PASS = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))
        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                MONGO_USER, MONGO_PASS, MONGO_HOST, MONGO_PORT))

        cpa_db = mongodb_client[MONGO_DB]
        deployments_collection = cpa_db.deployments
        list_of_deployments = []

        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # get workload cpu
        query_workload_cpu = """
        sum(
          irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
        * on(namespace,pod)
          group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
        ) by (workload, workload_type)
        """
        get_workload_cpu_query = lambda: prom.custom_query(query=query_workload_cpu)

        def get_deployments_cpu_usage(list_of_deployments):
            wl_cpu_res = get_workload_cpu_query()
            # filter results (unit is millicores)
            filtered_cpu_query = {
                q['metric']['workload']: float(q['value'][1]) * 1000
                for q in wl_cpu_res
                if q['metric']['workload'] in list_of_deployments
            }
            # if metric skipped, put in None instead
            for d in list_of_deployments:
                if d not in filtered_cpu_query:
                    filtered_cpu_query[d] = None
            return filtered_cpu_query

        deployments_cpu = get_deployments_cpu_usage(list_of_deployments)

        # Parse spec into a dict
        # A sample "redis-cart" Deployment spec (the full JSON returned by the
        # Kubernetes API, plus "runType": "scaler") can be pasted here in place
        # of the stdin read below when testing offline.
        spec = json.loads(sys.stdin.read())

        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}\n")
        sys.exit(1)
Code example #7
    def client(self):
        verify = os.getenv('APIALCHEMY_PROMETHEUS_SSL_VERIFY',
                           'true').lower() == 'true'

        if not verify:
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        return PrometheusConnect(**self._conn_params, disable_ssl=not verify)
Code example #8
    def test_retry_on_error(self):  # noqa D102
        retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        pc = PrometheusConnect(url=self.prometheus_host,
                               disable_ssl=True,
                               retry=retry)

        with self.assertRaises(requests.exceptions.RetryError,
                               msg="too many 400 error responses"):
            pc.custom_query("BOOM.BOOM!#$%")
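For a self-contained run, this test would need at least the following imports; the Retry class comes from urllib3:

import requests
from urllib3.util.retry import Retry

from prometheus_api_client import PrometheusConnect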
Code example #9
File: prometheus.py Project: urfin78/weatherdisplay
    def __init__(self, host, port, disablessl):
        if disablessl:
            self.schema = "http"
        else:
            self.schema = "https"
        try:
            self.prom = PrometheusConnect(url=self.schema + "://" + host +
                                          ":" + port,
                                          disable_ssl=disablessl)
        except Exception:
            print("Error")
Code example #10
File: get_data.py Project: husky-parul/paocp
def pro():
    pc = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers={
            "Authorization":
            "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
        },
        disable_ssl=True)
    up_metric = MetricsList(
        pc.get_current_metric_value(
            metric_name="haproxy_backend_up{exported_namespace='prophet'}"))
    print(up_metric[0])
Code example #11
    def get_current(self) -> float:
        prom = PrometheusConnect(
            url=self.config.get("url", "http://localhost:9090"),
            disable_ssl=self.config.get("disable_ssl", True),
        )
        res = prom.custom_query(query=self.query)
        if not res:
            log.error("Prometheus query: no result")
            raise Exception("Prometheus query: no result")

        log.info(f"Prometheus query result: {res}")
        return float(res[0].get("value")[-1])
Code example #12
def timed_job():
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    key = config.get('DEFAULT', 'KEY')
    promi = config.get('DEFAULT', 'PROM')
    promup = promi.encode()
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')
    blob_service = BlockBlobService(account_name=account, account_key=key)
    userAndPass = b64encode(promup).decode("ascii")
    headers = {'Authorization': 'Basic %s' % userAndPass}

    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)
    metric_data = prom.all_metrics()

    time = datetime.now()
    metrics = []
    values = []

    for i in metric_data:
        metric = prom.get_metric_range_data(metric_name=i,
                                            start_time=time -
                                            timedelta(hours=1),
                                            end_time=time,
                                            chunk_size=timedelta(hours=1))
        x = 0
        for d in metric:
            for name, dct in d.items():
                dct = dict(dct)
                if name == 'metric':
                    dct['id'] = x
                    metrics.append(dct)
                else:
                    for key in dct:
                        va = {}
                        va['time'] = key
                        va['value'] = dct[key]
                        va['id'] = x
                        values.append(va)
                        x = x + 1

    df = pd.DataFrame(metrics)
    df1 = pd.DataFrame(values)
    df = pd.merge(df, df1, how='inner', left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')

    df = df.drop(['endpoint', 'service', 'id'], axis=1)
    write_pandas_dataframe_to_blob(
        blob_service, df, container,
        str((datetime.now()).date()) + '/' +
        str(datetime.now().time()).replace(':', '').replace(".", ''))
Code example #13
def launch_prometheus():
    if kube_env.check_kubernetes_status() != util.EXIT_SUCCESS:
        log.error("Kubernetes is not set up."
                  " Did you run the deployment script?")
        sys.exit(util.EXIT_FAILURE)
    cmd = "kubectl get pods -n istio-system -lapp=prometheus "
    cmd += " -o jsonpath={.items[0].metadata.name}"
    prom_pod_name = util.get_output_from_proc(cmd).decode("utf-8")
    cmd = f"kubectl port-forward -n istio-system {prom_pod_name} 9090"
    prom_proc = util.start_process(cmd, preexec_fn=os.setsid)
    time.sleep(2)
    prom_api = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

    return prom_proc, prom_api
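Since the port-forward is started in its own session (preexec_fn=os.setsid), a matching teardown can signal the whole process group. This is a sketch under that assumption, not code from the project:

import os
import signal

def kill_prometheus(prom_proc):
    # terminate the kubectl port-forward and anything it spawned
    os.killpg(os.getpgid(prom_proc.pid), signal.SIGTERM)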
Code example #14
File: capture_helper.py Project: DatLQ95/tue_drl_vnf
    def __init__(self, docker_client_services_path,
                 docker_server_services_path, ingress_distribution_file_path,
                 docker_lb_container_path, service_list):
        self.docker_client_services = get_docker_services(
            docker_client_services_path)
        self.docker_server_services = get_docker_services(
            docker_server_services_path)
        self.get_ingress_distribution = get_docker_services(
            ingress_distribution_file_path)
        self.docker_lb_services = get_docker_services(docker_lb_container_path)
        self.prom = PrometheusConnect(url="http://131.155.35.54:9090",
                                      disable_ssl=True)
        self.capture_time = CAPTURE_TIME
        self.service_list = service_list
Code example #15
File: app.py Project: goern/thoth-slo-reporter
def check_database_metrics_availability(configuration: Configuration) -> bool:
    """Check database metrics (Prometheus/Thanos) availability."""
    pc = PrometheusConnect(
        url=configuration.thanos_url,
        headers={"Authorization": f"bearer {configuration.thanos_token}"},
        disable_ssl=True,
    )
    response = pc._session.get(
        "{0}/".format(pc.url),
        verify=pc.ssl_verification,
        headers=pc.headers,
        params={},
    )
    return response.ok
Code example #16
File: app.py Project: CoderKevinZhang/AI-SIG
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """if key exists, the value will be replaced,
       add dynamic status
       {ai.centaurus.io/gpu0:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:1},
        ai.centaurus.io/gpu1:{cur_mem_used:4GB, max_gpu_util:60, max_mem_cpy_util:34, cyclic:True, process_cnt:2, processes:[{pid:25678, cur_mem_used:3GB},{pid:67234, cur_mem_used:1GB}]}                                 
       }
    """
    ret_dict = dict()
    promi = PrometheusConnect(url=url, disable_ssl=True)
    try:
        promi.check_prometheus_connection()
    except Exception as e:  # catch connection errors
        logging.error(e)
        return ret_dict  # if connection fails, return an empty dict
    instance = pod_ip + ":9400"  # DCGM exporter port, fixed for now
    start_time = parse_datetime(ana_window)
    end_time = parse_datetime("now")
    my_label_config = {"instance": instance}  # select current host metrics
    metric_data = promi.get_metric_range_data(metric_name=metrics,
                                              label_config=my_label_config,
                                              start_time=start_time,
                                              end_time=end_time)
    # reorganize data to label_config and metric_values
    metric_object_list = MetricsList(metric_data)
    for item in metric_object_list: # iterate through all the gpus on the node
        if 'gpu' not in item.label_config: # handle metric config info exception
            continue
        id = item.label_config['gpu']  # predefined key from dcgm (gpu index)
        # ip = item.label_config['instance']
        key = DOMAIN + "/gpu-" + id
        cur_usage = collect_cur_usage(int(id))
        ts = item.metric_values.iloc[:, 1]  # metric_values is a two-column df: timestamp, value
        cur_usage['cyclic_pattern'] = False
        if ts.max() > 0:
            cyclic, period = cyclic_pattern_detection(ts)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)       
        cur_usage['max_mem_util'] = str(ts.max())
        # Important: flatten nested dictionary to string, otherwise error "cannot unmarshal string into Go value of type map[string]interface {}""
        ret_dict[key] = str(cur_usage)
    return ret_dict
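The cyclic_pattern_detection and collect_cur_usage helpers are not part of the excerpt. As a rough illustration only, an autocorrelation-based detector along these lines would fit the call site; the approach and every name here are assumptions, not the project's code:

import numpy as np

def cyclic_pattern_detection(ts, min_lag=2, threshold=0.8):
    """Hypothetical sketch: return (is_cyclic, period) via autocorrelation."""
    values = np.asarray(ts, dtype=float)
    values = values - values.mean()  # remove the mean so lag 0 dominates less
    if len(values) <= min_lag or not values.any():
        return False, 0
    acf = np.correlate(values, values, mode="full")[len(values) - 1:]
    acf = acf / acf[0]  # normalize so the zero-lag coefficient is 1.0
    period = min_lag + int(np.argmax(acf[min_lag:]))
    return bool(acf[period] >= threshold), period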
Code example #17
def update_saved_prom_metrics(metrics, save_dir):
    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL",
                         "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    # get metrics if available
    if "cluster_operator_conditions" in metrics:
        conditions_df = metric_preprocessors.opconds_metrics_to_df(
            metrics_raw=pc.get_current_metric_value(
                "cluster_operator_conditions"))
    if "cluster_installer" in metrics:
        install_df = metric_preprocessors.installer_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_installer"))
    if "cluster_version" in metrics:
        versions_df = metric_preprocessors.version_metrics_to_df(
            metrics_raw=pc.get_current_metric_value("cluster_version"))

    # combine all metrics
    metrics_df = conditions_df.merge(install_df,
                                     how="left",
                                     left_index=True,
                                     right_index=True)
    metrics_df = metrics_df.merge(versions_df,
                                  how="left",
                                  left_index=True,
                                  right_index=True)

    # nans because some install types are neither upi nor ipi (unknown)
    metrics_df["install_type_IPI"] = metrics_df["install_type_IPI"].fillna(0)
    metrics_df["install_type_UPI"] = metrics_df["install_type_UPI"].fillna(0)

    # save to volume
    metrics_df.to_parquet(
        path=os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
Code example #18
def queryMetrics(customquery, trim):
    # print("\n queryMetrics START\n")

    prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

    data = prom.custom_query(query=customquery)
    # To make it a table where each row is a metric
    df = MetricSnapshotDataFrame(data)
    df = df[df.value != "NaN"]

    df[['value']] = df[['value']].apply(pd.to_numeric)
    df[['timestamp']] = df[['timestamp']].apply(pd.to_datetime, unit='s')

    sortedDf = df.sort_values('value', ascending=False).head(trim)

    # print(nicenumbers)
    # print(df.index)
    # print(df.columns)
    # print("\n queryMetrics END\n")
    return sortedDf
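A hedged usage example (the metric name is a placeholder):

top_memory = queryMetrics('container_memory_working_set_bytes', 10)  # top 10 series by value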
Code example #19
def build_reports(timestamp, config, es_url, thanos_url, grafana_url, target_index):
    es_client = Elasticsearch(es_url)
    thanos_client = PrometheusConnect(thanos_url, disable_ssl=True)
    clusters, docs = collect.get_clusters(es_client, timestamp, indices=config['searchIndices'])
    reports = []
    for cluster in clusters:
        benchmarks = collect.get_benchmarks_for_cluster(cluster['cluster_name'], docs, config['ignoreTags'])
        for benchmark in benchmarks:
            report = {
                **cluster,
                'report_type': 'podLatency',
                'metadata': benchmark['metadata'],
                "results": collect.get_benchmark_results(benchmark, es_client)
            }

            if report['results'] != {}:
                print(f"cluster {report['cluster_name']} has results")
                reports.append(report)

    for report in enrich.enrich_reports(reports, grafana_url, thanos_client, config):
        response = index.index_report(es_client, report, target_index)
        print(response)
Code example #20
import logging

import tornado.ioloop
import tornado.web
from tornado.httpserver import HTTPServer
from prometheus_client import Gauge, generate_latest, REGISTRY
from prometheus_api_client import PrometheusConnect, Metric
from configuration import Configuration
from graph_handler import GraphHandler
import schedule

_LOGGER = logging.getLogger(__name__)

PREDICTOR_MODEL_LIST = list()

pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prometheus_headers,
    disable_ssl=True,
)

for metric in Configuration.metrics_list:
    metric_init = pc.get_current_metric_value(metric_name=metric)

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            Configuration.algorithm(
                unique_metric,
                rolling_data_window_size=Configuration.rolling_training_window_size,
            ))

GAUGE_DICT = dict()
Code example #21
    def setUp(self):
        """
        set up connection settings for prometheus
        """
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)
Code example #22
    def setUp(self):
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz',
                                    disable_ssl=True)
Code example #23
    def __init__(self, nodes: NodeDataView):
        super().__init__(nodes)
        self._prom = PrometheusConnect(url=settings.prometheus.url)
Code example #24
def process_period(config, period):
    period_start = period['instant'] + dateutil.relativedelta.relativedelta(
        seconds=-period['range_sec'])
    print(
        f"Processing year {period['year']}, month {period['month']}, "
        f"querying from {period['instant'].isoformat()} and going back {period['range_sec']} s to {period_start.isoformat()}."
    )
    queries = QueryLogic(queryRange=(str(period['range_sec']) + 's'))

    # SSL generally not used for Prometheus access within a cluster
    # Docs on instant query API: https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
    prom = PrometheusConnect(url=config.prometheus_server, disable_ssl=True)
    prom_connect_params = {
        'time': period['instant'].isoformat(),
        'timeout': config.query_timeout
    }

    raw_results, results, result_lengths = {}, {}, []
    # iterate over each query (cputime, starttime, endtime, cores) producing raw_results['cputime'] etc.
    for query_name, query_string in vars(queries).items():
        # Each of these raw_results is a list of dicts. Each dict in the list represents an individual data point, and contains:
        # 'metric': a dict of one or more key-value pairs of labels, one of which is the pod name ('exported_pod').
        # 'value': a list in which the 0th element is the timestamp of the value, and 1th element is the actual value we're interested in.
        print(f'Executing {query_name} query: {query_string}')
        t1 = timer()
        raw_results[query_name] = prom.custom_query(query=query_string,
                                                    params=prom_connect_params)
        t2 = timer()
        results[query_name] = dict(rearrange(raw_results[query_name]))
        result_lengths.append(len(results[query_name]))
        t3 = timer()
        print(
            f'Query finished in {t2 - t1} s, processed in {t3 - t2} s. Got {len(results[query_name])} items from {len(raw_results[query_name])} results. Peak RAM usage: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss}K.'
        )
        del raw_results[query_name]

    cputime = results['cputime']
    endtime = results['endtime']
    starttime = results['starttime']
    cores = results['cores']

    # Confirm the assumption that cputime should have the fewest entries, while starttime and cores may have additional ones
    # corresponding to jobs that have started but not finished yet, and endtime may have additional ones if there are pods without CPU resource requests.
    # We only want the jobs for which all values are available: start time, end time, CPU request.
    # Note that jobs which started last month and finished this month will be properly included and accounted in this month.
    assert len(cputime) == min(
        result_lengths), "cputime should be the shortest list"
    # However, jobs that finished last month may show up in this month's data if they are still present on the cluster this month (in Completed state).
    # Exclude them by filtering with a lambda (since you can't pass an argument to a function object AFAIK).
    endtime = dict(
        filter(lambda x: x[1] >= datetime.datetime.timestamp(period_start),
               endtime.items()))
    # Prepare to iterate over jobs which meet all criteria.
    valid_jobs = cputime.keys() & endtime.keys()
    # avoid sending empty records
    if len(valid_jobs) == 0:
        print('No records to process.')
        return

    sum_cputime = 0
    t4 = timer()
    for key in valid_jobs:
        assert endtime[key] > starttime[
            key], "job end time is before start time"
        # double check cputime calc of this job
        delta = abs(cputime[key] -
                    (endtime[key] - starttime[key]) * cores[key])
        assert delta < 0.001, "cputime calculation is inaccurate"
        sum_cputime += cputime[key]

    # CPU time as calculated here means (# cores * job duration), which apparently corresponds to
    # the concept of wall time in APEL accounting. It is not clear what CPU time means in APEL;
    # could be the actual CPU usage % integrated over the job (# cores * job duration * usage)
    # but this does not seem to be documented clearly. Some batch systems do not actually measure
    # this so it is not reported consistently or accurately. Some sites report CPU efficiency
    # (presumably defined as CPU time / wall time) of up to ~500% of the walltime, or
    # always fixed at 100%. In Kubernetes, the actual CPU usage % is tracked by metrics server
    # (not KSM), which is not meant to be used for monitoring or accounting purposes and is not
    # scraped by Prometheus. So just use walltime = cputime
    sum_cputime = round(sum_cputime)
    sum_walltime = sum_cputime

    print(f'total cputime: {sum_cputime}, total walltime: {sum_walltime}')
    # Write output to the message queue on local filesystem
    # https://dirq.readthedocs.io/en/latest/queuesimple.html#directory-structure
    dirq = QueueSimple(str(config.output_path))
    summary_output = summary_message(
        config,
        year=period['year'],
        month=period['month'],
        wall_time=sum_walltime,
        cpu_time=sum_cputime,
        n_jobs=len(endtime),
        # this appears faster than getting min/max during the dict iteration above
        first_end=round(min(endtime.values())),
        last_end=round(max(endtime.values())))
    sync_output = sync_message(config,
                               year=period['year'],
                               month=period['month'],
                               n_jobs=len(endtime))
    t5 = timer()
    summary_file = dirq.add(summary_output)
    sync_file = dirq.add(sync_output)
    print(f'Analyzed {len(endtime)} records in {t5 - t4} s.')
    print(f'Writing summary record to {config.output_path}/{summary_file}:')
    print('--------------------------------\n' + summary_output +
          '--------------------------------')
    print(f'Writing sync record to {config.output_path}/{sync_file}:')
    print('--------------------------------\n' + sync_output +
          '--------------------------------')
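The rearrange helper is not included in the excerpt. Judging from the comments above on the raw result shape (pod name in the 'exported_pod' label, value at index 1), a plausible sketch might be:

def rearrange(raw_results):
    """Hypothetical helper: yield (pod_name, value) pairs from raw query results."""
    for item in raw_results:
        yield item['metric']['exported_pod'], float(item['value'][1])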
Code example #25
from util import get_interval_minutes
from config_parser import getSettings
from prometheus_api_client import PrometheusConnect
from prometheus_api_client.utils import parse_timedelta
from datetime import datetime
from hashlib import md5
import json

headers = None
settings = getSettings('prometheus')
if 'access_token' in settings:
    headers = {'Authorization': 'bearer ' + settings['access_token']}

pc = PrometheusConnect(
    url=settings['url'],
    headers=headers,
    disable_ssl=True,
)


def query_range(expr, time_range, resolution, fill_na):
    fill_method = {
        'zeros': fill_na_zeros,
        'default': None
    }[fill_na or 'default']

    delta = parse_timedelta('now', time_range)
    start_time = datetime.now() - delta
    end_time = datetime.now()

    data = pc.custom_query_range(query=expr,
Code example #26
    def setUp(self):  # noqa D102
        self.pc = PrometheusConnect(url="http://doesnt_matter.xyz",
                                    disable_ssl=True)
Code example #27
File: dailysummary.py Project: thatsk/junkcode
    def __init__(self, url, disable_ssl=False):
        self.prom = PrometheusConnect(url=url, disable_ssl=disable_ssl)
Code example #28
def query_prom_data_range(svc_names,
                          query_fn,
                          start_time,
                          end_time,
                          sampling_rate=1,
                          is_summary=False,
                          url="http://vmhost1.local:9090"):
    """Query Prometheus metric data for customized services during customized time range.
    
    Params:
        svc_names: service metric names
        query_fn: function to construct the Prometheus query string from the service name.
        start_time: start time. A datetime.datetime object.
        end_time: same as start. A datetime.datetime object.
        sampling_rate: float, in seconds.
        is_summary: Boolean to represent whether the query is a summary with quantiles.
    
    Returns:
        all_metric_data: A dict of all metric data. Keys are service names. 
            Values are dict containing timestamps and values (If is_summary is True, there are multiple timestamp and value items).
    """
    def append_data(d, key, l):
        if key in d:
            d[key].append(l)
        else:
            d[key] = [l]

    prom = PrometheusConnect(url=url, disable_ssl=True)
    all_metric_data = {}
    for n in svc_names:
        query = query_fn(n)

        # Split into 3-hour batch and get one batch at a time.
        batch_len = datetime.timedelta(hours=3)
        batch_start = start_time
        batch_end = start_time + batch_len
        timestamps_dict = {}
        values_dict = {}
        metric_info = None
        while batch_start < end_time:
            if batch_end >= end_time:
                batch_end = end_time
            metric_data = prom.custom_query_range(query=query,
                                                  start_time=batch_start,
                                                  end_time=batch_end,
                                                  step=sampling_rate)
            # Sometimes there are no metric data within the range. Skip processing.
            if len(metric_data) > 0:
                if metric_info is None:
                    metric_info = {}
                    metric_info['metric'] = metric_data[0]['metric'].copy()

                for one_data in metric_data:
                    raw_values = np.array(one_data['values'], dtype=np.float64)
                    # Retrieve multiple time series data for different quantiles.
                    if is_summary is True:
                        # Remove quantile from metric info.
                        metric_info['metric'].pop('quantile', None)
                        key = 'q' + one_data['metric']['quantile']
                    else:
                        # Only one time series
                        key = 'data'
                    append_data(timestamps_dict, key, raw_values[:, 0])
                    append_data(values_dict, key, raw_values[:, 1])

            # The previous range [batch_start, batch_end] was inclusive at both ends,
            # so we move to the next timestamp here.
            batch_start = batch_end + datetime.timedelta(seconds=sampling_rate)
            batch_end = batch_start + batch_len

        def concat(d, name, conv_type=np.float64):
            for k, v in d.items():
                merged_v = np.concatenate(v).astype(conv_type)
                metric_info[f'{name}_{k}'] = merged_v

        concat(timestamps_dict, 'timestamps', conv_type=np.int64)
        concat(values_dict, 'values')
        all_metric_data[n] = metric_info
    return all_metric_data
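A hedged usage sketch; the service names, query template, and time window are placeholders:

import datetime

data = query_prom_data_range(
    ["frontend", "cartservice"],
    lambda n: f'rate(request_duration_seconds_sum{{service="{n}"}}[1m])',
    start_time=datetime.datetime(2021, 2, 1),
    end_time=datetime.datetime(2021, 2, 2),
    sampling_rate=5)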
Code example #29
import re

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from prometheus_api_client import PrometheusConnect

scope = ["https://spreadsheets.google.com/feeds",'https://www.googleapis.com/auth/spreadsheets',"https://www.googleapis.com/auth/drive.file","https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name("../client_secret.json", scope)
client = gspread.authorize(creds)
sheet = client.open_by_key('1ry_tos2ZityB4futWmUTNmXN5q-NnZwIF_BqNv9n8E8').worksheet("telemeter")

url = "https://telemeter-lts.datahub.redhat.com"
token = ""
with open('telemeter_token.txt', 'r') as file:
    api_token = file.read()

pc = PrometheusConnect(url=url, headers={"Authorization": "bearer {}".format(token)}, disable_ssl=False)

# gets query information for a specific date - each function call is a customized query
def getClusterMetric(buildRow, time):
    data = None
    try:
        data = pc.custom_query(query='count(sum by (_id)(subscription_labels{managed="true"}) * 1)', params={"time":time.strftime('%Y-%m-%dT%H:%M:%SZ')})      
        for arr in data:
            buildRow.append(arr.get('value')[1])
    except Exception:
        pass  # skip appending the value if the query fails

def getCPUCores(buildRow, time):
    data = None
    try:
        data = pc.custom_query(query='sum(sum by (_id)(cluster:capacity_cpu_cores:sum) + \
Code example #30
File: app.py Project: bissenbay/slo-reporter
def collect_metrics(configuration: Configuration, sli_report: SLIReport):
    """Collect metrics from Prometheus/Thanos."""
    if not _DRY_RUN:
        pc = PrometheusConnect(
            url=configuration.thanos_url,
            headers={"Authorization": f"bearer {configuration.thanos_token}"},
            disable_ssl=True,
        )

    collected_info = {}

    for sli_name, sli_methods in sli_report.report_sli_context.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        collected_info[sli_name] = {}

        for query_name, query_inputs in sli_methods["query"].items():

            requires_range = False

            if isinstance(query_inputs, dict):
                query = query_inputs["query"]
                requires_range = query_inputs["requires_range"]
                action_type = query_inputs["type"]
            else:
                query = query_inputs

            _LOGGER.info(f"Querying... {query_name}")
            _LOGGER.info(f"Using query... {query}")

            try:
                if not _DRY_RUN:

                    if requires_range:
                        metric_data = pc.custom_query_range(
                            query=query,
                            start_time=configuration.start_time,
                            end_time=configuration.end_time,
                            step=configuration.step,
                        )

                    else:
                        metric_data = pc.custom_query(query=query)

                    _LOGGER.info(f"Metric obtained... {metric_data}")

                    if requires_range:
                        metrics_vector = [
                            float(v[1]) for v in metric_data[0]["values"]
                            if float(v[1]) > 0
                        ]
                        result = manipulate_retrieved_metrics_vector(
                            metrics_vector=metrics_vector, action=action_type)

                        collected_info[sli_name][query_name] = result

                    else:
                        collected_info[sli_name][query_name] = float(
                            metric_data[0]["value"][1])

                else:
                    metric_data = [{
                        "metric": "dry run",
                        "value": [datetime.datetime.utcnow(), 0]
                    }]
                    result = float(metric_data[0]["value"][1])
                    collected_info[sli_name][query_name] = result

            except Exception as e:
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                collected_info[sli_name][query_name] = "ErrorMetricRetrieval"

    return collected_info
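manipulate_retrieved_metrics_vector is referenced above but not shown; a minimal sketch of the kind of reduction it might perform (the action names are guesses):

def manipulate_retrieved_metrics_vector(metrics_vector, action):
    """Hypothetical helper: reduce a vector of samples to a single number."""
    if not metrics_vector:
        return 0
    if action == "average":
        return sum(metrics_vector) / len(metrics_vector)
    if action == "max":
        return max(metrics_vector)
    return metrics_vector[-1]  # default: most recent sample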