Example #1
0
def train_individual_model(predictor_model, initial_run):
    """Download recent samples for one metric and retrain its predictor.

    Args:
        predictor_model: object exposing ``metric`` (with ``metric_name`` and
            ``label_config``) and a ``train(data, interval)`` method.
        initial_run: when truthy, the download starts
            ``Configuration.rolling_training_window_size`` ago instead of
            ``Configuration.metric_chunk_size`` ago.

    Returns:
        The same ``predictor_model`` after ``train`` has been called on it.
    """
    target = predictor_model.metric
    prom = PrometheusConnect(
        url=Configuration.prometheus_url,
        headers=Configuration.prom_connect_headers,
        disable_ssl=True,
    )

    # Default to the short incremental window; widen it on the first run.
    window_start = datetime.now() - Configuration.metric_chunk_size
    if initial_run:
        window_start = datetime.now() - Configuration.rolling_training_window_size

    # Only the first series returned by Prometheus is used for training.
    downloaded = prom.get_metric_range_data(
        metric_name=target.metric_name,
        label_config=target.label_config,
        start_time=window_start,
        end_time=datetime.now(),
    )
    fresh_samples = downloaded[0]

    training_began = datetime.now()
    predictor_model.train(fresh_samples, Configuration.retraining_interval_minutes)
    elapsed = str(datetime.now() - training_began)

    _LOGGER.info(
        "Total Training time taken = %s, for metric: %s %s",
        elapsed,
        target.metric_name,
        target.label_config,
    )
    return predictor_model
    def __init__(self, action):
        """Record the sample metadata in ``action`` and prepare Prometheus access.

        Args:
            action: mapping that provides at least the keys ``uuid``, ``user``,
                ``cluster_name``, ``test_config``, ``starttime`` and
                ``endtime``. The two time fields must be accepted by ``int()``
                and are interpreted as epoch seconds.

        Environment:
            prom_step: optional query step; defaults to 30.
            prom_token, prom_url: when both are set a ``PrometheusConnect``
                client is created and ``self.get_data`` is True; otherwise a
                warning is logged and ``self.get_data`` stays False.
        """
        self.sample_info_dict = action
        self.uuid = action["uuid"]
        self.user = action["user"]
        self.cluster_name = action["cluster_name"]
        self.test_config = action["test_config"]

        # Epoch-second strings -> datetime objects.
        self.start = datetime.fromtimestamp(int(self.sample_info_dict["starttime"]))
        self.end = datetime.fromtimestamp(int(self.sample_info_dict["endtime"]))

        # Step value to be used in the prometheus query. Default is 30 seconds
        # (openshift default scraping interval) but can be overridden with env.
        # NOTE(review): the override is kept as the raw string from the
        # environment while the default is an int - confirm consumers accept both.
        self.T_Delta = os.environ.get("prom_step", 30)

        self.get_data = "prom_token" in os.environ and "prom_url" in os.environ
        if self.get_data:
            token = os.environ["prom_token"]
            self.url = os.environ["prom_url"]
            self.headers = {'Authorization': "Bearer " + token}
            self.pc = PrometheusConnect(url=self.url, headers=self.headers, disable_ssl=True)
        else:
            # Logger.warn is a deprecated alias; warning() is the supported name.
            logger.warning("""snafu service account token and prometheus url not set \n
                        No Prometheus data will be indexed""")
  def get_gpu_number(self):
    """Return the largest count of unallocated GPUs reported for any exporter pod.

    Lists the pods labelled ``app=nvidia-dcgm-exporter`` in all namespaces and,
    for each pod that has an IP, asks Prometheus for the number of GPU
    instances seen on ``<pod_ip>:9400`` minus those that carry an
    ``exported_pod`` label.

    Returns:
      int: the maximum of those per-pod values, or 0 when the pod listing
      fails, no exporter pod has an IP yet, or no query returns data.
    """
    pods = []
    # Verify if dcgm-exporter is deployed.
    try:
      pods = self.api_client.list_pod_for_all_namespaces(
        label_selector="app=nvidia-dcgm-exporter").items
    except ApiException as e:
      # A 404 is treated as "exporter not deployed" and stays silent; any
      # other API failure is logged. Either way zero GPUs are reported
      # instead of failing later on the missing pod list.
      if e.status != 404:
        _LOGGER.error("Exception when calling DCGM exporter pods: %s\n", e)

    # Pods that have not been assigned an IP yet cannot be matched to a
    # Prometheus instance label, so they are skipped.
    pod_ips = [pod.status.pod_ip for pod in pods if pod.status.pod_ip]
    if not pod_ips:
      return 0

    prom = PrometheusConnect(
      url=self.get_prometheus_url(),
      headers={"Authorization": "Bearer " + self.get_openshift_prometheus_token()},
      disable_ssl=True)

    max_available_gpu = 0
    for pod_ip in pod_ips:
      gpu_query = 'count (count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="' + pod_ip +\
                  ':9400"}) or vector(0)) - count(count by (UUID,GPU_I_ID) (DCGM_FI_PROF_GR_ENGINE_ACTIVE{instance="'\
                  + pod_ip + ':9400", exported_pod=~".+"}) or vector(0))'

      result = prom.custom_query(query=gpu_query)
      if not result:
        # Nothing came back for this instance; treat it as contributing no GPUs.
        continue

      available_in_node = int(result[0]['value'][1])
      max_available_gpu = max(max_available_gpu, available_in_node)
    return max_available_gpu
Example #4
0
def collect_metrics():
    """Run every SLI query against Prometheus/Thanos and gather the results.

    Returns:
        dict: ``{sli_name: {query_name: value}}`` where ``value`` is the float
        taken from the first sample of the query result, or ``None`` when the
        query or its parsing raised.
    """
    thanos = PrometheusConnect(
        url=_THANOS_URL,
        headers={"Authorization": f"bearer {_THANOS_TOKEN}"},
        disable_ssl=True,
    )

    collected_info = {}
    for sli_name, sli_methods in SLIReport.REPORT_SLI_CONTEXT.items():
        _LOGGER.info(f"Retrieving data for... {sli_name}")
        per_sli = {}
        collected_info[sli_name] = per_sli

        for query_name, query in sli_methods["query"].items():
            _LOGGER.info(f"Querying... {query_name}")
            try:
                metric_data = thanos.custom_query(query=query)
                _LOGGER.info(f"Metric obtained... {metric_data}")
                first_sample = metric_data[0]
                per_sli[query_name] = float(first_sample["value"][1])
            except Exception as e:
                # Best effort: log the failure and record the gap as None.
                _LOGGER.exception(
                    f"Could not gather metric for {sli_name}-{query_name}...{e}"
                )
                per_sli[query_name] = None

    return collected_info
Example #5
0
def main():
    """Gather CPU usage for the tracked deployments and pass it to ``metric``.

    Steps:
      1. Read the tracked deployment names from the ``deployments`` collection
         in MongoDB (the ``list`` field of the last document returned wins).
      2. Query Prometheus for per-workload CPU usage in the ``default``
         namespace and convert it to millicores.
      3. Parse the spec from standard input as JSON.
      4. Call ``metric(spec, list_of_deployments, deployments_cpu)``.

    Connection settings are read from the environment (``MONGO_HOST``,
    ``MONGO_PORT``, ``MONGO_DBNAME``, ``MONGO_USERNAME``, ``MONGO_PASSWORD``,
    ``PROMETHEUS_URL``), each falling back to a built-in default.

    Any exception is written to stderr and the process exits with status 1.
    """
    # Local import so the block brings its own dependency into scope.
    from urllib.parse import quote_plus

    try:
        # Setting up Mongo DB
        mongo_host = str(os.environ.get('MONGO_HOST', '127.0.0.1'))
        mongo_port = str(os.environ.get('MONGO_PORT', '27017'))
        mongo_db = str(os.environ.get('MONGO_DBNAME', 'cpa'))
        mongo_user = str(os.environ.get('MONGO_USERNAME', 'root'))
        # NOTE(review): a default password is baked into the source; it is kept
        # for backward compatibility but should come from a secret instead.
        mongo_pass = str(os.environ.get('MONGO_PASSWORD', 'iRhrF6O0vp'))

        # Percent-escape the credentials so characters such as '@', ':' or '/'
        # cannot corrupt the connection URI.
        mongodb_client = MongoClient(
            'mongodb://{}:{}@{}:{}/?authSource=admin'.format(
                quote_plus(mongo_user), quote_plus(mongo_pass),
                mongo_host, mongo_port))

        deployments_collection = mongodb_client[mongo_db].deployments
        list_of_deployments = []
        for deployment in deployments_collection.find():
            list_of_deployments = deployment['list']

        # Setting up Prometheus
        prometheus_base = str(
            os.environ.get('PROMETHEUS_URL', 'http://192.168.23.92:9090'))
        prom = PrometheusConnect(url=prometheus_base, disable_ssl=True)

        # get workload cpu
        query_workload_cpu = """
        sum(
          irate(container_cpu_usage_seconds_total{cluster="", namespace="default"}[2m])
        * on(namespace,pod)
          group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster="", namespace="default", workload_type="deployment"}
        ) by (workload, workload_type)
        """
        wl_cpu_res = prom.custom_query(query=query_workload_cpu)

        # Keep only tracked deployments; the unit is millicores.
        deployments_cpu = {
            q['metric']['workload']: float(q['value'][1]) * 1000
            for q in wl_cpu_res
            if q['metric']['workload'] in list_of_deployments
        }
        # Deployments with no sample in the result are reported as None.
        for name in list_of_deployments:
            deployments_cpu.setdefault(name, None)

        # Parse spec into a dict
        spec = json.loads(sys.stdin.read())

        metric(spec, list_of_deployments, deployments_cpu)
    except Exception as err:
        sys.stderr.write(f"Error metric: {err}")
        sys.exit(1)
Example #6
0
    def test_retry_on_error(self):  # noqa D102
        # A 400 response is forced into the retry list so that an invalid
        # query exhausts the three retries and surfaces as RetryError.
        retry_policy = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        client = PrometheusConnect(
            url=self.prometheus_host,
            disable_ssl=True,
            retry=retry_policy,
        )

        expected_error = requests.exceptions.RetryError
        with self.assertRaises(expected_error, msg="too many 400 error responses"):
            client.custom_query("BOOM.BOOM!#$%")
Example #7
0
 def __init__(self, host, port, disablessl):
     """Pick the URL scheme from ``disablessl`` and create the Prometheus client.

     ``self.schema`` is "http" when ``disablessl`` compares equal to True and
     "https" otherwise. Any exception raised while building the URL or the
     client is swallowed after printing "Fehler"; ``self.prom`` is then not set.
     """
     # The explicit equality test is kept on purpose: only values equal to
     # True (not merely truthy ones) select plain http.
     self.schema = "http" if disablessl == True else "https"  # noqa: E712
     try:
         base_url = self.schema + "://" + host + ":" + port
         self.prom = PrometheusConnect(url=base_url, disable_ssl=disablessl)
     except Exception:
         print("Fehler")
Example #8
0
def pro():
    """Print the first current ``haproxy_backend_up`` series for namespace 'prophet'."""
    # NOTE(review): the bearer token is hard-coded in the source.
    auth_headers = {
        "Authorization": "bearer BSI2W0euoJWYRAvT0ZnSJVmgNQ87pl3o3yXuyy38qAg"
    }
    client = PrometheusConnect(
        url="https://prometheus-k8s-openshift-monitoring.apps-crc.testing",
        headers=auth_headers,
        disable_ssl=True,
    )
    current_values = client.get_current_metric_value(
        metric_name="haproxy_backend_up{exported_namespace='prophet'}")
    up_metric = MetricsList(current_values)
    print(up_metric[0])
Example #9
0
    def get_current(self) -> float:
        """Run ``self.query`` against Prometheus and return its latest value.

        The client is built from ``self.config`` (keys ``url`` and
        ``disable_ssl``, with defaults). The returned float is the last element
        of the ``value`` field of the first result.

        Raises:
            Exception: when the query returns nothing.
        """
        client = PrometheusConnect(
            url=self.config.get("url", "http://localhost:9090"),
            disable_ssl=self.config.get("disable_ssl", True),
        )
        res = client.custom_query(query=self.query)

        if res:
            log.info(f"Prometheus query result: {res}")
            first_value = res[0].get("value")
            return float(first_value[-1])

        log.error("Prometheus query: no result")
        raise Exception("Prometheus query: no result")
Example #10
0
def timed_job():
    """Export the last hour of every Prometheus metric to blob storage.

    Settings (storage account, key, Prometheus basic-auth credentials,
    container and URL) are read from the ``DEFAULT`` section of
    ``config/config.cfg``. The label sets and samples of all series are
    flattened into two tables, joined on a running ``id`` and written with
    ``write_pandas_dataframe_to_blob`` under a ``<date>/<time>`` blob name.
    """
    cfg = configparser.ConfigParser()
    cfg.read('config/config.cfg')
    account = cfg.get('DEFAULT', 'ACCOUNT')
    account_key = cfg.get('DEFAULT', 'KEY')
    prom_credentials = cfg.get('DEFAULT', 'PROM').encode()
    container = cfg.get('DEFAULT', 'CONTAINER')
    url = cfg.get('DEFAULT', 'URL')

    blob_service = BlockBlobService(account_name=account, account_key=account_key)
    encoded_credentials = b64encode(prom_credentials).decode("ascii")
    headers = {'Authorization': 'Basic %s' % encoded_credentials}

    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)
    metric_names = prom.all_metrics()

    now = datetime.now()
    one_hour = timedelta(hours=1)
    metrics = []
    values = []

    for metric_name in metric_names:
        series_list = prom.get_metric_range_data(metric_name=metric_name,
                                                 start_time=now - one_hour,
                                                 end_time=now,
                                                 chunk_size=one_hour)
        # NOTE(review): the id counter restarts for every metric name and
        # advances once per sample rather than once per series - confirm this
        # is the intended join key.
        next_id = 0
        for series in series_list:
            for field, payload in series.items():
                payload = dict(payload)
                if field == 'metric':
                    payload['id'] = next_id
                    metrics.append(payload)
                    continue
                for stamp, sample in payload.items():
                    values.append({'time': stamp, 'value': sample, 'id': next_id})
                    next_id += 1

    labels_frame = pd.DataFrame(metrics)
    samples_frame = pd.DataFrame(values)
    merged = pd.merge(labels_frame, samples_frame,
                      how='inner', left_on=['id'], right_on=['id'])
    merged['time'] = pd.to_datetime(merged['time'], unit='s')
    merged = merged.drop(['endpoint', 'service', 'id'], axis=1)

    date_part = str((datetime.now()).date())
    time_part = str(datetime.now().time()).replace(':', '').replace(".", '')
    write_pandas_dataframe_to_blob(
        blob_service, merged, container, date_part + '/' + time_part)
Example #11
0
 def __init__(self, docker_client_services_path,
              docker_server_services_path, ingress_distribution_file_path,
              docker_lb_container_path, service_list):
     """Load the service definitions and create the Prometheus client.

     Each of the four path arguments is passed to ``get_docker_services`` and
     the result is stored on the instance. ``service_list`` is stored as given
     and ``capture_time`` is taken from the module-level ``CAPTURE_TIME``.
     """
     load = get_docker_services
     self.docker_client_services = load(docker_client_services_path)
     self.docker_server_services = load(docker_server_services_path)
     self.get_ingress_distribution = load(ingress_distribution_file_path)
     self.docker_lb_services = load(docker_lb_container_path)

     # NOTE(review): the Prometheus endpoint is hard-coded.
     self.prom = PrometheusConnect(
         url="http://131.155.35.54:9090",
         disable_ssl=True,
     )
     self.capture_time = CAPTURE_TIME
     self.service_list = service_list
Example #12
0
class PromSummarizer(object):
    """Fetch a metric's history from Prometheus as (timestamp, value) pairs."""

    def __init__(self, url, disable_ssl=False):
        """Create the Prometheus client for ``url``."""
        self.prom = PrometheusConnect(url=url, disable_ssl=disable_ssl)

    def fetch(self, expression, number_of_days):
        """Yield ``(local_timestamp, value)`` for every sample of the first series.

        Args:
            expression: metric selector handed to ``get_metric_range_data``.
            number_of_days: integer formatted as ``"<n>d"`` and parsed by
                ``parse_datetime`` to obtain the start of the range
                (presumably "n days ago" - confirm against the library).
        """
        range_start = parse_datetime('%dd' % number_of_days)
        range_end = parse_datetime('now')
        per_chunk = parse_timedelta('now', '1d')

        chunks = self.prom.get_metric_range_data(
            expression,
            start_time=range_start,
            end_time=range_end,
            chunk_size=per_chunk,
        )

        # MetricsList combines the chunks into a single metric; only the
        # first combined series is reported.
        combined = MetricsList(chunks)[0]

        for row in combined.metric_values.values:
            stamp, val = row.tolist()
            # The timestamp is delivered in UTC: tag it as such, then convert
            # it to the local timezone.
            as_utc = stamp.to_pydatetime().replace(tzinfo=tz.tzutc())
            yield as_utc.astimezone(tz.tzlocal()), val
Example #13
0
class Configuration:
    """Configuration of metrics-exporter.

    Every setting is read from the process environment while the class body
    executes; a missing variable raises ``KeyError`` at that point. The
    Prometheus client is also created here, at class-definition time.
    """

    # Prometheus
    # Base URL of the Prometheus host.
    URL = os.environ["PROMETHEUS_HOST_URL"]
    # Token sent as the bearer credential in the Authorization header below.
    PROMETHEUS_SERVICE_ACCOUNT_TOKEN = os.environ[
        "PROMETHEUS_SERVICE_ACCOUNT_TOKEN"]
    HEADERS = {"Authorization": f"bearer {PROMETHEUS_SERVICE_ACCOUNT_TOKEN}"}
    # Client stored on the class; created with disable_ssl=True.
    PROM = PrometheusConnect(url=URL, disable_ssl=True, headers=HEADERS)

    # Namespaces
    THOTH_BACKEND_NAMESPACE = os.environ["THOTH_BACKEND_NAMESPACE"]
    THOTH_MIDDLETIER_NAMESPACE = os.environ["THOTH_MIDDLETIER_NAMESPACE"]
    THOTH_AMUN_INSPECTION_NAMESPACE = os.environ[
        "THOTH_AMUN_INSPECTION_NAMESPACE"]

    # Ceph
    # Note the attribute names differ from the environment variable names
    # (CEPH_ACCESS_* vs THOTH_CEPH_*).
    CEPH_ACCESS_KEY_ID = os.environ["THOTH_CEPH_KEY_ID"]
    CEPH_ACCESS_SECRET_KEY = os.environ["THOTH_CEPH_SECRET_KEY"]
    CEPH_BUCKET_PREFIX = os.environ["THOTH_CEPH_BUCKET_PREFIX"]
    S3_ENDPOINT_URL = os.environ["THOTH_S3_ENDPOINT_URL"]
    CEPH_BUCKET = os.environ["THOTH_CEPH_BUCKET"]

    DEPLOYMENT_NAME = os.environ["THOTH_DEPLOYMENT_NAME"]

    # Kebechet
    GITHUB_ACCESS_TOKEN = os.environ["GITHUB_ACCESS_TOKEN"]
Example #14
0
    def client(self):
        """Build a ``PrometheusConnect`` from ``self._conn_params``.

        TLS verification stays on unless the environment variable
        ``APIALCHEMY_PROMETHEUS_SSL_VERIFY`` is set to something other than
        "true" (case-insensitive); in that case urllib3's
        ``InsecureRequestWarning`` is silenced as well.
        """
        setting = os.getenv('APIALCHEMY_PROMETHEUS_SSL_VERIFY', 'true')
        verify = setting.lower() == 'true'

        if verify:
            return PrometheusConnect(**self._conn_params, disable_ssl=False)

        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return PrometheusConnect(**self._conn_params, disable_ssl=True)
Example #15
0
def profiling(url, pod_ip, ana_window='2m', metrics=MEM_UTIL):
    """Summarise recent per-GPU usage of one node as a flat dict of strings.

    For every series of ``metrics`` reported by instance ``<pod_ip>:9400``
    within ``ana_window`` that carries a ``gpu`` label, the dict returned by
    ``collect_cur_usage`` is extended with ``cyclic_pattern``, ``period`` (only
    when a cycle is detected) and ``max_mem_util``, then stored as a string
    under ``DOMAIN + "/gpu-" + <gpu label>``. An existing key is replaced.

    Returns:
        dict: the mapping described above; empty when the Prometheus
        connection check fails.
    """
    client = PrometheusConnect(url=url, disable_ssl=True)
    try:
        client.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        return dict()  # if the connection fails, return an empty dict

    # The exporter port is fixed for now; select only this host's metrics.
    host_labels = {"instance": pod_ip + ":9400"}
    window_start = parse_datetime(ana_window)
    window_end = parse_datetime("now")
    raw_series = client.get_metric_range_data(metric_name=metrics,
                                              label_config=host_labels,
                                              start_time=window_start,
                                              end_time=window_end)

    ret_dict = dict()
    # MetricsList regroups the raw data by label_config and metric_values;
    # iterate through all the GPUs reported for the node.
    for series in MetricsList(raw_series):
        labels = series.label_config
        if 'gpu' not in labels:  # series without a gpu label are ignored
            continue

        gpu_index = labels['gpu']  # predefined key from dcgm (gpu index)
        cur_usage = collect_cur_usage(int(gpu_index))

        # Second column of metric_values holds the sample values.
        samples = series.metric_values.iloc[:, 1]
        cur_usage['cyclic_pattern'] = False
        if samples.max() > 0:
            cyclic, period = cyclic_pattern_detection(samples)
            if cyclic:
                cur_usage['cyclic_pattern'] = True
                cur_usage['period'] = str(period)
        cur_usage['max_mem_util'] = str(samples.max())

        # Important: flatten the nested dictionary to a string, otherwise error
        # "cannot unmarshal string into Go value of type map[string]interface {}"
        ret_dict[DOMAIN + "/gpu-" + gpu_index] = str(cur_usage)
    return ret_dict
Example #16
0
def update_saved_prom_metrics(metrics, save_dir):
    """Fetch the current cluster metrics and save them as ``metrics.parquet``.

    The operator-conditions, installer and version metrics are fetched from
    Prometheus, converted with the matching ``metric_preprocessors`` helper,
    left-joined on their index and written to ``save_dir``.

    Args:
        metrics: container of metric names to fetch. All of
            ``cluster_operator_conditions``, ``cluster_installer`` and
            ``cluster_version`` must be present because the combined frame
            needs every one of them.
        save_dir: directory in which ``metrics.parquet`` is written.

    Raises:
        ValueError: if any of the three required metric names is missing
            (previously this surfaced as an unbound-variable error after the
            other metrics had already been fetched).

    Environment:
        FLT_PROM_URL: Prometheus URL (has a built-in default).
        FLT_PROM_ACCESS_TOKEN: bearer token sent in the Authorization header.
    """
    required = (
        "cluster_operator_conditions",
        "cluster_installer",
        "cluster_version",
    )
    missing = [name for name in required if name not in metrics]
    if missing:
        raise ValueError(
            f"metrics is missing required entries: {', '.join(missing)}")

    # connect to prometheus
    prom_url = os.getenv("FLT_PROM_URL",
                         "https://telemeter-lts.datahub.redhat.com/")
    prom_access_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    pc = PrometheusConnect(
        url=prom_url,
        headers={"Authorization": f"bearer {prom_access_token}"},
        disable_ssl=True,
    )

    conditions_df = metric_preprocessors.opconds_metrics_to_df(
        metrics_raw=pc.get_current_metric_value(
            "cluster_operator_conditions"))
    install_df = metric_preprocessors.installer_metrics_to_df(
        metrics_raw=pc.get_current_metric_value("cluster_installer"))
    versions_df = metric_preprocessors.version_metrics_to_df(
        metrics_raw=pc.get_current_metric_value("cluster_version"))

    # combine all metrics
    metrics_df = conditions_df.merge(install_df,
                                     how="left",
                                     left_index=True,
                                     right_index=True)
    metrics_df = metrics_df.merge(versions_df,
                                  how="left",
                                  left_index=True,
                                  right_index=True)

    # nans because some install types are neither upi nor ipi (unknown)
    metrics_df["install_type_IPI"] = metrics_df["install_type_IPI"].fillna(0)
    metrics_df["install_type_UPI"] = metrics_df["install_type_UPI"].fillna(0)

    # save to volume; the target is passed positionally because the keyword
    # was renamed from ``fname`` to ``path`` in pandas 1.0.
    metrics_df.to_parquet(
        os.path.join(save_dir, "metrics.parquet"),
        engine="pyarrow",
        index=True,
    )
Example #17
0
class PrometheusClient:
    """Thin wrapper exposing tick-time, TPS and player metrics from Prometheus."""

    def __init__(self, promhost, promport):
        """Connect to ``http://<promhost>:<promport>``."""
        base_url = "http://%s:%s" % (promhost, promport)
        self.prom = PrometheusConnect(url=base_url, disable_ssl=True)

    def get_ticktime(self):
        """Return the ``values`` of the first ``overall_ticktime`` series."""
        series = self.__get_metric_for_last_five_mins("overall_ticktime")
        return series[0].get("values")

    def get_dim_ticktime(self):
        """Return ``{dimension_name: values}`` for the ``dim_ticktime`` series."""
        return self._values_by_dimension("dim_ticktime")

    def get_players(self):
        """Return the ``player`` label of every ``player_playtime`` result."""
        return [
            entry.get("metric").get("player")
            for entry in self.prom.custom_query("player_playtime")
        ]

    def get_tps(self):
        """Return the ``values`` of the first ``overall_tps`` series."""
        series = self.__get_metric_for_last_five_mins("overall_tps")
        return series[0].get("values")

    def get_dim_tps(self):
        """Return ``{dimension_name: values}`` for the ``dim_tps`` series."""
        return self._values_by_dimension("dim_tps")

    def _values_by_dimension(self, metricname):
        """Map each series' ``dimension_name`` label to its ``values``."""
        return {
            series.get("metric").get("dimension_name"): series.get("values")
            for series in self.__get_metric_for_last_five_mins(metricname)
        }

    def __get_metric_for_last_five_mins(self, metricname):
        """Fetch the range data of ``metricname`` for the last five minutes."""
        window_start = datetime.datetime.now() - datetime.timedelta(minutes=5)
        window_end = datetime.datetime.now()
        return self.prom.get_metric_range_data(
            metric_name=metricname,
            start_time=window_start,
            end_time=window_end,
        )
Example #18
0
def queryMetrics(customquery, trim):
    """Run ``customquery`` on the local Prometheus and return the top rows.

    The result is turned into a ``MetricSnapshotDataFrame`` (one row per
    metric), rows whose value is the string "NaN" are dropped, ``value`` is
    made numeric and ``timestamp`` is converted from epoch seconds.

    Returns:
        The ``trim`` rows with the highest ``value``, in descending order.
    """
    client = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    snapshot = client.custom_query(query=customquery)

    # One row per metric, without the NaN samples.
    frame = MetricSnapshotDataFrame(snapshot)
    frame = frame[frame.value != "NaN"]

    frame[['value']] = frame[['value']].apply(pd.to_numeric)
    frame[['timestamp']] = frame[['timestamp']].apply(pd.to_datetime, unit='s')

    ranked = frame.sort_values('value', ascending=False)
    return ranked.head(trim)
def launch_prometheus():
    """Port-forward the Istio Prometheus pod and connect to it on localhost:9090.

    Exits the process with ``util.EXIT_FAILURE`` when the Kubernetes status
    check does not succeed.

    Returns:
        tuple: ``(prom_proc, prom_api)`` - the process started for
        ``kubectl port-forward`` and the ``PrometheusConnect`` client.
    """
    if kube_env.check_kubernetes_status() != util.EXIT_SUCCESS:
        log.error("Kubernetes is not set up."
                  " Did you run the deployment script?")
        sys.exit(util.EXIT_FAILURE)

    # Resolve the name of the first pod labelled app=prometheus.
    lookup_cmd = ("kubectl get pods -n istio-system -lapp=prometheus "
                  " -o jsonpath={.items[0].metadata.name}")
    prom_pod_name = util.get_output_from_proc(lookup_cmd).decode("utf-8")

    forward_cmd = f"kubectl port-forward -n istio-system {prom_pod_name} 9090"
    prom_proc = util.start_process(forward_cmd, preexec_fn=os.setsid)
    # Pause for two seconds before connecting to the forwarded port.
    time.sleep(2)

    prom_api = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
    return prom_proc, prom_api
Example #20
0
class my_prometheus():
    """Small Prometheus helper that remembers the latest queried value."""

    def __init__(self, host, port, disablessl):
        """Pick the URL scheme from ``disablessl`` and create the client.

        ``self.schema`` is "http" when ``disablessl`` compares equal to True
        and "https" otherwise. Any exception raised while building the URL or
        the client is swallowed after printing "Fehler"; ``self.prom`` is then
        not set.
        """
        # The explicit equality test is kept on purpose: only values equal to
        # True (not merely truthy ones) select plain http.
        self.schema = "http" if disablessl == True else "https"  # noqa: E712
        try:
            base_url = self.schema + "://" + host + ":" + port
            self.prom = PrometheusConnect(url=base_url, disable_ssl=disablessl)
        except Exception:
            print("Fehler")

    def prom_query(self, query):
        """Run ``query`` and cache the result on the instance.

        ``self.lasttemps`` holds the raw result; ``self.lasttemp`` is the
        second element of the greatest entry in the first result's ``values``.
        """
        self.lasttemps = self.prom.custom_query(query=query)
        samples = self.lasttemps[0]["values"]
        newest_first = sorted(samples, reverse=True)
        self.lasttemp = newest_first[0][1]
Example #21
0
def check_database_metrics_availability(configuration: Configuration) -> bool:
    """Check database metrics (Prometheus/Thanos) availability.

    Sends a GET request to the root of the configured Thanos URL through the
    client's own session, reusing its headers and SSL verification setting.

    Returns:
        True when the response reports ``ok``, False otherwise.
    """
    auth_headers = {"Authorization": f"bearer {configuration.thanos_token}"}
    client = PrometheusConnect(
        url=configuration.thanos_url,
        headers=auth_headers,
        disable_ssl=True,
    )
    response = client._session.get(
        "{0}/".format(client.url),
        verify=client.ssl_verification,
        headers=client.headers,
        params={},
    )
    return True if response.ok else False
Example #22
0
def build_reports(timestamp, config, es_url, thanos_url, grafana_url, target_index):
    """Build podLatency reports per cluster benchmark, enrich and index them.

    Clusters and documents are collected from Elasticsearch for ``timestamp``
    using ``config['searchIndices']``. One report is built per benchmark of
    each cluster and kept only when its results are non-empty. The kept
    reports are enriched via ``enrich.enrich_reports`` and indexed into
    ``target_index``; every indexing response is printed.
    """
    es_client = Elasticsearch(es_url)
    thanos_client = PrometheusConnect(thanos_url, disable_ssl=True)
    clusters, docs = collect.get_clusters(
        es_client, timestamp, indices=config['searchIndices'])

    reports = []
    for cluster in clusters:
        cluster_benchmarks = collect.get_benchmarks_for_cluster(
            cluster['cluster_name'], docs, config['ignoreTags'])
        for benchmark in cluster_benchmarks:
            report = dict(cluster)
            report['report_type'] = 'podLatency'
            report['metadata'] = benchmark['metadata']
            report['results'] = collect.get_benchmark_results(benchmark, es_client)

            # Benchmarks without results are dropped.
            if report['results'] == {}:
                continue
            print(f"cluster {report['cluster_name']} has results")
            reports.append(report)

    enriched = enrich.enrich_reports(reports, grafana_url, thanos_client, config)
    for report in enriched:
        print(index.index_report(es_client, report, target_index))
Example #23
0
import tornado.ioloop
import tornado.web
from tornado.httpserver import HTTPServer
from prometheus_client import Gauge, generate_latest, REGISTRY
from prometheus_api_client import PrometheusConnect, Metric
from configuration import Configuration
from graph_handler import GraphHandler
import schedule

_LOGGER = logging.getLogger(__name__)

# One predictor model per unique time series of every configured metric.
PREDICTOR_MODEL_LIST = []

pc = PrometheusConnect(
    url=Configuration.prometheus_url,
    headers=Configuration.prometheus_headers,
    disable_ssl=True,
)

for metric in Configuration.metrics_list:
    # The current value of a metric yields one entry per unique series.
    metric_init = pc.get_current_metric_value(metric_name=metric)

    for unique_metric in metric_init:
        PREDICTOR_MODEL_LIST.append(
            Configuration.algorithm(
                unique_metric,
                rolling_data_window_size=(
                    Configuration.rolling_training_window_size),
            )
        )

GAUGE_DICT = {}
class TestPrometheusConnect(unittest.TestCase):
    """
    Tests for class PrometheusConnect, run against the host named by PROM_URL
    """

    def setUp(self):
        """
        Create the client under test from the PROM_URL environment variable
        """
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def _assert_time_window(self, metric, start_time, end_time, start_msg,
                            end_msg):
        """
        Check that `metric` starts within one minute after `start_time` and
        ends within one minute before `end_time`
        """
        one_minute = timedelta(minutes=1)

        self.assertTrue(
            start_time.timestamp() < metric.start_time.timestamp(),
            start_msg,
        )
        self.assertTrue(
            (start_time + one_minute).timestamp() >
            metric.start_time.timestamp(),
            start_msg,
        )
        self.assertTrue(
            end_time.timestamp() > metric.end_time.timestamp(),
            end_msg,
        )
        self.assertTrue(
            (end_time - one_minute).timestamp() < metric.end_time.timestamp(),
            end_msg,
        )

    def test_metrics_list(self):
        """
        Check that the server returns at least one metric name
        """
        names = self.pc.all_metrics()
        self.assertTrue(len(names) > 0, "no metrics received from prometheus")

    def test_get_metric_range_data(self):
        """
        Range data for "up" over the last ten minutes fits the requested window
        """
        window_start = datetime.now() - timedelta(minutes=10)
        window_end = datetime.now()
        raw_data = self.pc.get_metric_range_data(
            metric_name="up",
            start_time=window_start,
            end_time=window_end,
        )

        metrics = MetricsList(raw_data)

        self.assertTrue(
            len(metrics) > 0, "no metrics received from prometheus")
        self._assert_time_window(
            metrics[0],
            window_start,
            window_end,
            "invalid metric start time",
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):
        """
        Same window check as above, with the download split by chunk_size
        """
        window_start = datetime.now() - timedelta(minutes=65)
        chunk = timedelta(minutes=7)
        window_end = datetime.now() - timedelta(minutes=5)
        raw_data = self.pc.get_metric_range_data(
            metric_name="up",
            start_time=window_start,
            end_time=window_end,
            chunk_size=chunk,
        )

        metrics = MetricsList(raw_data)

        self.assertTrue(
            len(metrics) > 0, "no metrics received from prometheus")
        self._assert_time_window(
            metrics[0],
            window_start,
            window_end,
            "invalid metric start time (with given chunk_size)",
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):
        """
        A string passed for start_time, end_time or chunk_size raises TypeError
        """
        valid_kwargs = {
            "metric_name": "up",
            "start_time": datetime.now() - timedelta(minutes=20),
            "chunk_size": timedelta(minutes=7),
            "end_time": datetime.now() - timedelta(minutes=10),
        }

        # (argument to corrupt, invalid value, message if nothing is raised)
        bad_inputs = (
            ("start_time", "20m", "start_time accepted invalid value type"),
            ("end_time", "10m", "end_time accepted invalid value type"),
            ("chunk_size", "10m", "chunk_size accepted invalid value type"),
        )

        for arg_name, bad_value, failure_msg in bad_inputs:
            call_kwargs = {**valid_kwargs, arg_name: bad_value}
            with self.assertRaises(TypeError, msg=failure_msg):
                _ = self.pc.get_metric_range_data(**call_kwargs)
 def setUp(self):
     """
     Build the client under test from the PROM_URL environment variable
     """
     host = os.getenv("PROM_URL")
     self.prometheus_host = host
     self.pc = PrometheusConnect(url=host, disable_ssl=True)
 def setUp(self):
     """
     Create the client under test with a fixed URL and disable_ssl=True
     """
     target_url = 'http://doesnt_matter.xyz'
     self.pc = PrometheusConnect(url=target_url, disable_ssl=True)
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """
    Network is blocked in this testcase, see base class
    """

    def setUp(self):
        """
        Create the client under test with a fixed URL and disable_ssl=True
        """
        target_url = 'http://doesnt_matter.xyz'
        self.pc = PrometheusConnect(url=target_url, disable_ssl=True)

    def test_network_is_blocked(self):
        """
        A plain request outside any mock_response block gets 403 / 'BOOM!'
        """
        blocked = requests.get('https://google.com')
        self.assertEqual(blocked.status_code, 403)
        self.assertEqual(blocked.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        """
        mock_response serves the given body/status and records each request
        """
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)

            mocked = requests.get('https://redhat.com')
            self.assertEqual(mocked.status_code, 500)
            self.assertEqual(mocked.text, 'kekekeke')

            self.assertEqual(len(handler.requests), 1)
            recorded = handler.requests[0]
            self.assertEqual(recorded.url, 'https://redhat.com/')

    def test_unauthorized(self):
        """
        A 403 response makes all_metrics raise PrometheusApiClientException
        """
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        """
        Each query method raises with the blocked-network status and body
        """
        expected_error = "HTTP Status Code 403 (b'BOOM!')"

        # Calls are wrapped in lambdas so each one only runs inside its own
        # assertRaises block, in the listed order.
        client_calls = (
            lambda: self.pc.all_metrics(),
            lambda: self.pc.get_current_metric_value("metric"),
            lambda: self.pc.get_metric_range_data("metric"),
            lambda: self.pc.custom_query_range(
                "query", datetime.now(), datetime.now(), "1"),
            lambda: self.pc.custom_query("query"),
        )

        for client_call in client_calls:
            with self.assertRaises(PrometheusApiClientException) as exc:
                client_call()
            self.assertEqual(expected_error, str(exc.exception))

    def test_all_metrics_method(self):
        """
        all_metrics issues exactly one request to the label-values endpoint
        """
        payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            first_request = handler.requests[0]
            self.assertEqual(first_request.path_url,
                             "/api/v1/label/__name__/values")
Example #28
0
 def __init__(self, nodes: NodeDataView):
     """Initialise the base class with ``nodes`` and create the Prometheus client."""
     super().__init__(nodes)
     prometheus_url = settings.prometheus.url
     self._prom = PrometheusConnect(url=prometheus_url)
Example #29
0
class PrometheusMetricProvider(MetricProvider):
    """Metric provider that fills ``self._data`` from Prometheus queries."""

    def __init__(self, nodes: NodeDataView):
        """Initialise the base class and create the Prometheus client."""
        super().__init__(nodes)
        self._prom = PrometheusConnect(url=settings.prometheus.url)

    def get_metric(self, metric: PrometheusMetric) -> List:
        """Run ``metric.query``; log and return ``[]`` on a client exception."""
        try:
            rows = self.__prom_request(metric.query)
        except PrometheusApiClientException as e:
            logger.error(f"Error pulling {metric}: {e}")
            rows = []
        return rows

    def __prom_request(self, query: str) -> List:
        """Send ``query`` through the client's ``custom_query``."""
        return self._prom.custom_query(query)

    def refresh_data(self):
        """Re-pull every metric and store each result under its ``_data`` key."""
        logger.debug("Pulling metrics from Prometheus")
        loaders = (
            ("messages_in", self.__get_messages_in),
            ("messages_out", self.__get_messages_out),
            ("consumer_lag", self.__get_consumer_lag),
            ("consumer_read_rate", self.__get_consumer_read_rate),
            ("topic_size", self.__get_topic_size),
            ("replicas", self.__get_replicas),
            ("connector_tasks", self.__get_connector_tasks),
        )
        for data_key, load in loaders:
            self._data[data_key] = load()

    def __latest_by_label(self, metric: PrometheusMetric, label: str,
                          convert) -> Dict:
        """Map each row's ``metric[label]`` to ``convert`` of its last ``value`` item.

        Rows come from :meth:`get_metric`, so a failed query yields ``{}``.
        """
        # NOTE(review): the last "value" item is presumably the sample value
        # of a [timestamp, value] pair -- confirm against the client docs.
        return {
            row["metric"][label]: convert(row["value"][-1])
            for row in self.get_metric(metric=metric)
        }

    def __get_messages_in(self) -> Dict[str, float]:
        """Per-topic MESSAGES_IN values, rounded to two decimals."""
        return self.__latest_by_label(
            PrometheusMetric.MESSAGES_IN, "topic",
            lambda raw: round(float(raw), 2))

    def __get_messages_out(self) -> Dict[str, float]:
        """Per-topic MESSAGES_OUT values, rounded to two decimals."""
        return self.__latest_by_label(
            PrometheusMetric.MESSAGES_OUT, "topic",
            lambda raw: round(float(raw), 2))

    def __get_consumer_lag(self) -> Dict[str, int]:
        """Per-group CONSUMER_LAG values as ints."""
        return self.__latest_by_label(
            PrometheusMetric.CONSUMER_LAG, "group", int)

    def __get_consumer_read_rate(self) -> Dict[str, float]:
        """Per-group CONSUMER_READ_RATE values as floats."""
        return self.__latest_by_label(
            PrometheusMetric.CONSUMER_READ_RATE, "group", float)

    def __get_topic_size(self) -> Dict[str, int]:
        """Per-topic TOPIC_SIZE values as ints."""
        return self.__latest_by_label(
            PrometheusMetric.TOPIC_SIZE, "topic", int)

    def __get_replicas(self) -> Dict[str, int]:
        """Per-deployment REPLICAS values as ints."""
        return self.__latest_by_label(
            PrometheusMetric.REPLICAS, "deployment", int)

    def __get_connector_tasks(self) -> Dict[str, int]:
        """Per-connector CONNECTOR_TASKS values as ints."""
        return self.__latest_by_label(
            PrometheusMetric.CONNECTOR_TASKS, "connector", int)
Example #30
0
class get_prometheus_data:
    """Pull Prometheus range data for one benchmark sample and flatten it.

    The sample description (``action``) supplies the time window and is merged
    into every emitted document. Data is only pulled when both the
    ``prom_token`` and ``prom_url`` environment variables are set.
    """

    def __init__(self, action):
        """Store the sample description and, if configured, build the client.

        Args:
            action: Mapping with at least ``uuid``, ``user``, ``cluster_name``,
                ``test_config``, ``starttime`` and ``endtime``. The two time
                fields are passed through ``int()`` and
                ``datetime.fromtimestamp``.

        Raises:
            KeyError: If one of the keys listed above is missing.
        """
        self.sample_info_dict = action
        self.uuid = action["uuid"]
        self.user = action["user"]
        self.cluster_name = action["cluster_name"]
        self.test_config = action["test_config"]

        # change datetime in seconds string to datetime object
        self.start = datetime.fromtimestamp(
            int(self.sample_info_dict["starttime"]))
        self.end = datetime.fromtimestamp(
            int(self.sample_info_dict["endtime"]))

        # step value to be used in prometheus query
        # default is 30 seconds(openshift default scraping interval)
        # but can be overridden with env
        self.T_Delta = os.environ.get("prom_step", 30)

        self.get_data = "prom_token" in os.environ and "prom_url" in os.environ
        if self.get_data:
            token = os.environ["prom_token"]
            self.url = os.environ["prom_url"]
            self.headers = {"Authorization": "Bearer " + token}
            self.pc = PrometheusConnect(url=self.url,
                                        headers=self.headers,
                                        disable_ssl=True)
        else:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning(
                """snafu service account token and prometheus url not set \n
                        No Prometheus data will be indexed""")

    @staticmethod
    def _finite_or_zero(raw):
        """Parse a sample value, mapping NaN, +Inf and -Inf to 0."""
        import math

        parsed = float(raw)
        return parsed if math.isfinite(parsed) else 0

    def get_all_metrics(self):
        """Yield one flat document per sample of every configured query.

        Queries are read from ``prometheus_labels/<tool>_included_labels.json``
        next to this file, falling back to
        ``prometheus_labels/included_labels.json`` when no tool-specific file
        exists. A query that raises is logged and skipped. Nothing is yielded
        when ``self.get_data`` is false.

        Yields:
            dict: ``metric`` (the result's label dict, with ``__name__``
            renamed to ``name`` or ``name`` set to the query's label),
            ``Date`` (UTC timestamp string), ``value`` (float, or 0 for
            NaN/+Inf/-Inf), ``metric_name``, plus every entry of
            ``self.sample_info_dict``.
        """
        # Function-scope import: this method is the only user of timezone.
        from datetime import timezone

        # check get_data bool, if false by-pass all processing
        if not self.get_data:
            return

        start_time = time.time()

        # resolve the directory holding the tool include files
        dirname = os.path.dirname(os.path.realpath(__file__))
        include_file_dir = os.path.join(dirname, "prometheus_labels/")
        tool_include_file = (include_file_dir +
                             self.sample_info_dict["tool"] +
                             "_included_labels.json")

        # check if tools include file is there
        # if not use the default include file
        if os.path.isfile(tool_include_file):
            filename = tool_include_file
        else:
            filename = os.path.join(include_file_dir, "included_labels.json")
        logger.info("using prometheus metric include file %s", filename)

        with open(filename, "r") as f:
            datastore = json.load(f)

        # The step is the same for every query, so build it once.
        step = str(self.T_Delta) + "s"

        for metric_name, query_item in datastore["data"].items():
            query = query_item["query"]
            label = query_item["label"]

            try:
                # Execute custom query to pull the desired labels between X and Y time.
                response = self.pc.custom_query_range(
                    query, self.start, self.end, step, None)
            except Exception as e:
                # Best effort: log the failing query and move on to the next one.
                response = []
                logger.info(query)
                logger.warning("failure to get metric results %s", e)

            for result in response:
                # clean up name key from __name__ to name; fall back to the
                # query's label when the result carries no __name__
                labels = result["metric"]
                labels["name"] = labels.pop("__name__", label)

                # each result has a list, we must flatten it out in order to send to ES
                for value in result["values"]:
                    # first index is time stamp
                    timestamp = datetime.fromtimestamp(
                        value[0],
                        tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

                    flat_doc = {
                        "metric": labels,
                        "Date": timestamp,
                        # second index is value of metric; NaN, Inf and -Inf
                        # are all replaced by 0
                        "value": self._finite_or_zero(value[1]),
                        "metric_name": metric_name,
                    }

                    flat_doc.update(self.sample_info_dict)
                    yield flat_doc

        logger.debug("Total Time --- %s seconds ---",
                     time.time() - start_time)