Example #1
import configparser
from base64 import b64encode
from datetime import datetime, timedelta

import pandas as pd
from azure.storage.blob import BlockBlobService
from prometheus_api_client import PrometheusConnect

# write_pandas_dataframe_to_blob is a project-local helper (not shown here)


def timed_job():
    """Read config, pull the last hour of every Prometheus metric and upload it to Azure Blob Storage."""
    config = configparser.ConfigParser()
    config.read('config/config.cfg')
    account = config.get('DEFAULT', 'ACCOUNT')
    key = config.get('DEFAULT', 'KEY')
    promi = config.get('DEFAULT', 'PROM')
    promup = promi.encode()
    container = config.get('DEFAULT', 'CONTAINER')
    url = config.get('DEFAULT', 'URL')
    blob_service = BlockBlobService(account_name=account, account_key=key)
    userAndPass = b64encode(promup).decode("ascii")
    headers = {'Authorization': 'Basic %s' % userAndPass}

    prom = PrometheusConnect(url=url, headers=headers, disable_ssl=False)
    metric_data = prom.all_metrics()

    time = datetime.now()
    metrics = []
    values = []

    x = 0  # one id per time series, shared by its label row and its value rows
    for i in metric_data:
        metric = prom.get_metric_range_data(metric_name=i,
                                            start_time=time -
                                            timedelta(hours=1),
                                            end_time=time,
                                            chunk_size=timedelta(hours=1))
        for d in metric:
            # each d holds one time series: its 'metric' labels and its
            # 'values' ([timestamp, value] pairs)
            for name, dct in d.items():
                dct = dict(dct)
                if name == 'metric':
                    dct['id'] = x
                    metrics.append(dct)
                else:
                    for ts in dct:
                        values.append({'time': ts, 'value': dct[ts], 'id': x})
            x += 1

    df = pd.DataFrame(metrics)
    df1 = pd.DataFrame(values)
    df = pd.merge(df, df1, how='inner', left_on=['id'], right_on=['id'])
    df['time'] = pd.to_datetime(df['time'], unit='s')

    df = df.drop(['endpoint', 'service', 'id'], axis=1)
    write_pandas_dataframe_to_blob(
        blob_service, df, container,
        str((datetime.now()).date()) + '/' +
        str(datetime.now().time()).replace(':', '').replace(".", ''))
import os
import unittest
from datetime import datetime, timedelta

import requests
from prometheus_api_client import MetricsList, PrometheusConnect
from prometheus_api_client.exceptions import PrometheusApiClientException

# BaseMockedNetworkTestcase is a project-local test helper (not shown here)


class TestPrometheusConnect(unittest.TestCase):
    """
    Test module for class PrometheusConnect
    """
    def setUp(self):
        """
        set up connection settings for prometheus
        """
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """
        Check if setup was done correctly
        """
        metrics_list = self.pc.all_metrics()
        self.assertTrue(
            len(metrics_list) > 0, "no metrics received from prometheus")

    def test_get_metric_range_data(self):
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(self):
        start_time = datetime.now() - timedelta(minutes=20)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=10)

        with self.assertRaises(TypeError,
                               msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=end_time,
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError,
                               msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time="10m",
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError,
                               msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time=end_time,
                                              chunk_size="10m")
class TestPrometheusConnectWithMockedNetwork(BaseMockedNetworkTestcase):
    """
    Network is blocked in this testcase, see base class
    """
    def setUp(self):
        self.pc = PrometheusConnect(url='http://doesnt_matter.xyz',
                                    disable_ssl=True)

    def test_network_is_blocked(self):
        resp = requests.get('https://google.com')
        self.assertEqual(resp.status_code, 403)
        self.assertEqual(resp.text, 'BOOM!')

    def test_how_mock_prop_works(self):
        with self.mock_response('kekekeke', status_code=500) as handler:
            self.assertEqual(len(handler.requests), 0)
            resp = requests.get('https://redhat.com')
            self.assertEqual(resp.status_code, 500)
            self.assertEqual(resp.text, 'kekekeke')

            self.assertEqual(len(handler.requests), 1)
            request = handler.requests[0]
            self.assertEqual(request.url, 'https://redhat.com/')

    def test_unauthorized(self):
        with self.mock_response("Unauthorized", status_code=403):
            with self.assertRaises(PrometheusApiClientException) as exc:
                self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'Unauthorized')",
                         str(exc.exception))

    def test_broken_responses(self):
        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.all_metrics()
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_current_metric_value("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.get_metric_range_data("metric")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query_range("query", datetime.now(), datetime.now(),
                                       "1")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

        with self.assertRaises(PrometheusApiClientException) as exc:
            self.pc.custom_query("query")
        self.assertEqual("HTTP Status Code 403 (b'BOOM!')", str(exc.exception))

    def test_all_metrics_method(self):
        all_metrics_payload = {"status": "success", "data": ["up", "alerts"]}

        with self.mock_response(all_metrics_payload) as handler:
            self.assertTrue(len(self.pc.all_metrics()))
            self.assertEqual(handler.call_count, 1)
            request = handler.requests[0]
            self.assertEqual(request.path_url, "/api/v1/label/__name__/values")
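BaseMockedNetworkTestcase itself is not shown above. A minimal sketch of its mock_response helper, assuming the requests_mock library (the project's actual helper may be built differently, and the always-on 403/'BOOM!' blocking seen in test_network_is_blocked would need extra wiring, e.g. in setUpClass):

# Hypothetical stand-in for the project-local BaseMockedNetworkTestcase; the
# interface (mock_response yielding a handler with .requests and .call_count)
# is inferred from the tests above.
import unittest
from contextlib import contextmanager

import requests_mock


class BaseMockedNetworkTestcase(unittest.TestCase):
    """Lets tests stub out every HTTP response made through requests."""

    @contextmanager
    def mock_response(self, body, status_code=200):
        with requests_mock.Mocker() as mocker:
            kwargs = {'json': body} if isinstance(body, dict) else {'text': body}
            # answer any method on any URL with the given body and status code
            mocker.register_uri(requests_mock.ANY, requests_mock.ANY,
                                status_code=status_code, **kwargs)
            yield _HandlerView(mocker)


class _HandlerView:
    """Adapts requests_mock's Mocker to the handler API used in the tests."""

    def __init__(self, mocker):
        self._mocker = mocker

    @property
    def requests(self):
        return self._mocker.request_history

    @property
    def call_count(self):
        return self._mocker.call_count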
Example #4
import os
import unittest
from datetime import datetime, timedelta

import requests
from urllib3.util.retry import Retry
from prometheus_api_client import MetricsList, PrometheusConnect


class TestPrometheusConnect(unittest.TestCase):
    """Test module for class PrometheusConnect."""
    def setUp(self):
        """Set up connection settings for prometheus."""
        self.prometheus_host = os.getenv("PROM_URL")
        self.pc = PrometheusConnect(url=self.prometheus_host, disable_ssl=True)

    def test_metrics_list(self):
        """Check if setup was done correctly."""
        metrics_list = self.pc.all_metrics()
        self.assertTrue(
            len(metrics_list) > 0, "no metrics received from prometheus")

    def test_get_metric_range_data(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time",
        )

    def test_get_metric_range_data_with_chunk_size(self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=65)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=5)
        metric_data = self.pc.get_metric_range_data(metric_name="up",
                                                    start_time=start_time,
                                                    end_time=end_time,
                                                    chunk_size=chunk_size)

        metric_objects_list = MetricsList(metric_data)

        self.assertTrue(
            len(metric_objects_list) > 0,
            "no metrics received from prometheus")
        self.assertTrue(
            start_time.timestamp() <
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            (start_time + timedelta(minutes=1)).timestamp() >
            metric_objects_list[0].start_time.timestamp(),
            "invalid metric start time (with given chunk_size)",
        )
        self.assertTrue(
            end_time.timestamp() > metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )
        self.assertTrue(
            (end_time - timedelta(minutes=1)).timestamp() <
            metric_objects_list[0].end_time.timestamp(),
            "invalid metric end time (with given chunk_size)",
        )

    def test_get_metric_range_data_with_incorrect_input_types(
            self):  # noqa D102
        start_time = datetime.now() - timedelta(minutes=20)
        chunk_size = timedelta(minutes=7)
        end_time = datetime.now() - timedelta(minutes=10)

        with self.assertRaises(ValueError,
                               msg="specified chunk_size is too big"):
            _ = self.pc.get_metric_range_data(
                metric_name="up",
                start_time=start_time,
                end_time=end_time,
                chunk_size=timedelta(minutes=30),
            )
        with self.assertRaises(TypeError,
                               msg="start_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time="20m",
                                              end_time=end_time,
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError,
                               msg="end_time accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time="10m",
                                              chunk_size=chunk_size)
        with self.assertRaises(TypeError,
                               msg="chunk_size accepted invalid value type"):
            _ = self.pc.get_metric_range_data(metric_name="up",
                                              start_time=start_time,
                                              end_time=end_time,
                                              chunk_size="10m")

    def test_get_metric_aggregation(self):  # noqa D102
        operations = [
            "sum", "max", "min", "variance", "percentile_50", "deviation",
            "average"
        ]
        start_time = datetime.now() - timedelta(minutes=10)
        end_time = datetime.now()
        step = "15"
        aggregated_values = self.pc.get_metric_aggregation(
            query="up",
            operations=operations,
            start_time=start_time,
            end_time=end_time,
            step=step)

        self.assertTrue(
            len(aggregated_values) > 0, "no values received after aggregating")

    def test_get_metric_aggregation_with_incorrect_input_types(
            self):  # noqa D102
        with self.assertRaises(TypeError,
                               msg="operations accepted invalid value type"):
            _ = self.pc.get_metric_aggregation(query="up", operations="sum")

    def test_retry_on_error(self):  # noqa D102
        retry = Retry(total=3, backoff_factor=0.1, status_forcelist=[400])
        pc = PrometheusConnect(url=self.prometheus_host,
                               disable_ssl=True,
                               retry=retry)

        with self.assertRaises(requests.exceptions.RetryError,
                               msg="too many 400 error responses"):
            pc.custom_query("BOOM.BOOM!#$%")
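Outside the test harness, get_metric_aggregation is called the same way against a live server. A minimal sketch, assuming (per the library's documentation) that it returns a dict keyed by operation name; the URL is a placeholder:

# Standalone sketch of get_metric_aggregation; the URL is a placeholder and the
# loop assumes the result is a dict with one entry per requested operation.
from prometheus_api_client import PrometheusConnect

pc = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)
aggregations = pc.get_metric_aggregation(
    query="up", operations=["sum", "max", "min", "average"])
for operation, value in aggregations.items():
    print(operation, value)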
Example #5
import logging
import sys

import pandas as pd
from prometheus_api_client import (MetricSnapshotDataFrame, MetricsList,
                                   PrometheusConnect)
from prometheus_api_client.utils import parse_datetime


def get_all_metrics(start_time='5m', end_time='now', instance='', gpu_id=''):
    """
    Query all DCGM metrics, on all instances and all GPUs,
    and save the dumped data to one CSV file per instance/GPU pair.
    """
    # parse the time range once up front, so every query below uses the same window
    start_time = parse_datetime(start_time)
    end_time = parse_datetime(end_time)
    # connect to the Prometheus server, exit if the connection fails
    url = "http://prometheus:9090"  # use the service name instead of an IP to be more robust
    prom = PrometheusConnect(url=url, disable_ssl=True)
    try:
        prom.check_prometheus_connection()
    except Exception as e:
        logging.error(e)
        sys.exit(1)
    # get all metrics under the profiler job; note: some instances/GPUs may not have every metric, due to model variance
    metrics = prom.all_metrics()
    metrics = [a for a in metrics if 'DCGM' in a]
    gpu_util = 'DCGM_FI_DEV_GPU_UTIL'
    label_cfg = {"job": "profiler-pods"}
    # get a snapshot of all the instances (pod IPs)
    metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                label_config=label_cfg)
    metric_df = MetricSnapshotDataFrame(metric_data)
    instances = metric_df.instance.unique()
    ins_gpu = dict()
    for ins in instances:
        # add instance in query
        label_cfg['instance'] = ins
        metric_data = prom.get_current_metric_value(metric_name=gpu_util,
                                                    label_config=label_cfg)
        metric_df = MetricSnapshotDataFrame(metric_data)
        gpus = metric_df.gpu.unique()
        # put each instance's gpus into dictionary
        ins_gpu[ins] = gpus

    my_label_config = {"job": "profiler-pods", "gpu": gpu_id}  # select gpu0
    #my_label_config = {"instance": instance}  # select all gpu
    # if one particular instance is given, update instances
    if instance != '':
        instances = [
            instance,
        ]
    for ins in instances:
        if gpu_id != '':
            gpus = [
                gpu_id,
            ]
        else:
            gpus = ins_gpu[ins]
            print(ins, gpus)
        for gpu in gpus:
            my_label_config = {"instance": ins, "gpu": gpu}
            df = pd.DataFrame()
            for metric_name in metrics:
                # select from different metric_name to query
                metric_data = prom.get_metric_range_data(
                    metric_name=metric_name,
                    label_config=my_label_config,
                    start_time=start_time,
                    end_time=end_time)

                # reorganize data to label_config and metric_values
                metric_object_list = MetricsList(metric_data)
                if len(metric_object_list) > 0:
                    if 'datetime' not in df.columns:
                        df['datetime'] = metric_object_list[0].metric_values[
                            'ds']
                    df[metric_name] = metric_object_list[0].metric_values['y']

            file_name = "_".join([ins, gpu]) + ".csv"
            df.to_csv(file_name)
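A possible invocation (the instance address and GPU id are placeholders, not values from the original example):

# Example invocation; the instance address and GPU id below are placeholders.
if __name__ == "__main__":
    # dump the last 30 minutes for every instance and GPU found under the job
    get_all_metrics(start_time='30m', end_time='now')
    # or restrict the dump to a single instance and GPU
    get_all_metrics(start_time='30m', end_time='now',
                    instance='10.0.0.1:9400', gpu_id='0')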