def _get_metrics_from_prometheus(self, observer=None):
    # Collect credentials to connect to a Prometheus instance
    prom_token = os.getenv("PROM_ACCESS_TOKEN")
    prom_url = os.getenv("PROM_URL")
    if not (prom_token or prom_url):
        sys.exit("Error: Prometheus credentials not found")

    prom = Prometheus(url=prom_url, token=prom_token, data_chunk='5m', stored_data='5m')
    metrics_list = prom.all_metrics()  # Get a list of all the metrics available from Prometheus

    print("Polling Prometheus for new metric data")
    metric_data = dict()
    if observer:
        for metric in metrics_list:
            pkt = (prom.get_metric(name=metric))[0]
            metric_data[metric] = pkt
            observer.on_next(pkt)  # Push metric data to the observer
    else:
        for metric in metrics_list:
            metric_data[metric] = (prom.get_metric(name=metric))[0]
    return metric_data

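# Minimal sketch of an observer that _get_metrics_from_prometheus() could push to.
# Only the on_next() call used above is assumed; any object with that method works.
class PrintingObserver:
    def on_next(self, pkt):
        # Each pkt is one metric packet as returned by prom.get_metric()
        print(pkt)

# e.g. self._get_metrics_from_prometheus(observer=PrintingObserver())
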
def setUp(self):
    port = os.environ.get("PROM_PORT", "9990")
    self.port_forward = subprocess.Popen([
        'kubectl', '-n', 'istio-system', 'port-forward',
        'deployment/prometheus', '%s:9090' % port
    ], stdout=subprocess.PIPE)
    self.port_forward.stdout.readline()  # Wait for port forward to be ready
    self.prom = Prometheus('http://localhost:%s/' % port)

def setUpClass(self):
    port = os.environ.get("PROM_PORT", "9990")
    namespace, deployment = find_prometheus()
    self.port_forward = subprocess.Popen([
        'kubectl', '-n', namespace, 'port-forward', deployment,
        '%s:9090' % port
    ], stdout=subprocess.PIPE)
    self.port_forward.stdout.readline()  # Wait for port forward to be ready
    self.prom = Prometheus('http://localhost:%s/' % port)

class TestAlarms(unittest.TestCase):
    def test_graceful_shutdown(self):
        queries = [
            Query(
                'Graceful Shutdown: 5xx Requests/s',
                'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client", response_code=~"5.."}[10m]))',
                Alarm(
                    lambda error_rate: error_rate > 0,
                    'There were 5xx errors. Requests may be getting dropped.')),
            Query(
                'Graceful Shutdown: Total Requests/s',
                'sum(rate(istio_requests_total{destination_service="httpbin.graceful-shutdown.svc.cluster.local", source_app="client"}[10m]))',
                Alarm(
                    lambda qps: qps < 18,
                    'Not enough requests sent; expect at least 18. Service may be having issues.')),
        ]
        self.run_queries(queries)

    def test_external_traffic(self):
        queries = [
            Query(
                'External Traffic: Total requests',
                'sum(rate(istio_requests_total{destination_service="fortio-server.allow-external-traffic-b.svc.cluster.local"}[10m]))',
                Alarm(
                    lambda qps: qps < 250,
                    'Not enough requests sent; expect at least 250. Service may be having issues.')),
            # Cross-namespace metrics are not recorded
        ]
        self.run_queries(queries)

    def setUp(self):
        port = os.environ.get("PROM_PORT", "9990")
        self.port_forward = subprocess.Popen([
            'kubectl', '-n', 'istio-system', 'port-forward',
            'deployment/prometheus', '%s:9090' % port
        ], stdout=subprocess.PIPE)
        self.port_forward.stdout.readline()  # Wait for port forward to be ready
        self.prom = Prometheus('http://localhost:%s/' % port)

    def tearDown(self):
        self.port_forward.terminate()

    def run_queries(self, queries):
        for query in queries:
            with self.subTest(name=query.description):
                errors = self.prom.run_query(query, debug=True)
                message = 'Alarms Triggered:'
                for e in errors:
                    message += '\n- ' + e
                assert_empty(errors, message)

def config_push_converge_query(prom: Prometheus, svc: str = "svc-0", namespace: str = 'pilot-load'):
    # Note: cluster_name is built here but the query below matches on a hard-coded
    # ".*pilot-load.*" regex rather than this value.
    cluster_name = 'outbound|890||{0}.{1}.svc.cluster.local'.format(svc, namespace)
    result = prom.fetch_by_query(
        'count(envoy_cluster_upstream_cx_total{cluster_name=~".*pilot-load.*"}) by (cluster_name)')
    if not result:
        return []
    return [(point['metric'], point['value'][1]) for point in result['data']['result']]

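# Minimal usage sketch for config_push_converge_query(); the local port-forward URL is an
# assumption, and each returned pair is (label dict, sample value) grouped by cluster_name.
prom = Prometheus('http://localhost:9990/')
for labels, value in config_push_converge_query(prom, svc='svc-0', namespace='pilot-load'):
    print(labels.get('cluster_name'), value)
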
def observe_prom_metrics_range(self, observer, metrics_list, start_time, end_time='now', chunk_size='1h'):
    # Collect credentials to connect to a Prometheus instance
    prom_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    prom_url = os.getenv("FLT_PROM_URL")
    if not (prom_token or prom_url):
        sys.exit("Error: Prometheus credentials not found")

    prom = Prometheus(url=prom_url, token=prom_token)

    # Calculate the chunk size (in seconds) to download and push to the observer at each iteration
    chunk_seconds = int(round((dateparser.parse('now') - dateparser.parse(chunk_size)).total_seconds()))

    print("\nCollecting metric data within datetime range:{0} - {1}".format(
        dateparser.parse(start_time), dateparser.parse(end_time)))

    start = dateparser.parse(start_time).timestamp()
    end = dateparser.parse(end_time).timestamp()

    while start < end:  # Main loop which iterates through time ranges, collecting one chunk of data per iteration
        for metric_name in metrics_list:  # Loop to get a chunk of data for every metric in the list
            print("Current Chunk Info: Metric = {0}, Time range = {1} - {2}".format(
                metric_name, dateparser.parse(str(start)), dateparser.parse(str(start + chunk_seconds))))
            pkt_list = prom.get_metric_range_data(metric_name=metric_name,
                                                  start_time=start,
                                                  end_time=start + chunk_seconds)
            # pkt_list contains data for multiple metric series, each of which is pushed to the observer.
            for pkt in pkt_list:
                # print(dateparser.parse(str(pkt['values'][0][0])), "-", dateparser.parse(str(pkt['values'][-1][0])))
                try:
                    observer.on_next(pkt)
                except Exception as e:
                    print(pkt)  # Check which pkt caused the exception
                    raise e
        start += chunk_seconds

def setup_promethus():
    port = os.environ.get("PROM_PORT", "9990")
    namespace, deployment = find_prometheus()
    port_forward = subprocess.Popen([
        'kubectl', '-n', namespace, 'port-forward', deployment,
        '%s:9090' % port
    ], stdout=subprocess.PIPE)
    port_forward.stdout.readline()  # Wait for port forward to be ready
    return Prometheus('http://localhost:%s/' % port, pid=port_forward.pid)

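# Hypothetical caller for setup_promethus(); fetch_value() is taken from the other snippets
# in this collection, and exposing the forwarder pid as prom.pid is an assumption used here
# only to clean up the kubectl port-forward.
import os
import signal

prom = setup_promethus()
try:
    print(prom.fetch_value('sum(up)'))
finally:
    os.kill(prom.pid, signal.SIGTERM)  # Stop the kubectl port-forward started above
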
def fetch(
    prometheus_server,
    output,
    start_date,
):
    p = Prometheus(prometheus_server)
    header_written = False
    start = datetime.strptime(start_date, '%Y/%m/%d').replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    with open(output, 'a') as f:
        # Truncate existing data
        f.truncate(0)
        while start < now:
            duration = timedelta(days=1)
            step = timedelta(minutes=1)
            data = p.range_query(
                'max(bitflyer_last_traded_price{product_code="BTC_JPY"}) by (product_code)',
                start,
                duration,
                step,
            )
            if len(data) > 0:
                series = pd.Series(
                    data[0]['values'].T[1],
                    index=data[0]['values'].T[0],
                )
                df = series.to_frame(name='ltp')
                df.index = pd.to_datetime(df.index, unit='s')
                df.index.name = 'timestamp'
                df.to_csv(f, header=(not header_written))
                header_written = True
            start += duration

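# Example invocation of fetch(); the server URL, output path, and start date below are
# placeholders, not values from the source. The start date must match '%Y/%m/%d'.
fetch('http://localhost:9090', 'btc_jpy_ltp.csv', '2021/01/01')
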
def job(current_time):
    # TODO: Replace this function with the model training function and set up the correct IntervalTrigger time
    global data_dict, predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, data_window, url, token, chunk_size, data_size, TRUE_LIST, store_intermediate_data
    global data, config_list
    # iteration += 1
    start_time = time.time()

    prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size)
    metric = prom.get_metric(metric_name)
    print("metric collected.")

    # Convert data to json
    metric = json.loads(metric)

    # The metric json is converted to a shaped dataframe. This dictionary contains
    # all the sub-labels as keys and their data as pandas DataFrames.
    data_dict = get_df_from_json(metric, data_dict, data_window)
    del metric, prom

    if str(store_intermediate_data) in TRUE_LIST:
        print("DataFrame stored at: ",
              cp().store_data(metric_name, pickle.dumps(data_dict),
                              (data_storage_path + str(datetime.now().strftime('%Y%m%d%H%M')))))

    if fixed_label_config != "None":  # If a label config has been specified
        # Split into multiple label configs
        single_label_data_dict = {}
        existing_config_list = list(data_dict.keys())
        # print(existing_config_list)
        for config in config_list:
            config_found = False
            for existing_config in existing_config_list:
                if SortedDict(literal_eval(existing_config)) == SortedDict(literal_eval(config)):
                    single_label_data_dict[existing_config] = data_dict[existing_config]
                    config_found = True
            if not config_found:
                print("Specified Label Configuration {} was not found".format(config))
                # raise KeyError
            # single_label_data_dict[config] = data_dict[config]
        # single_label_data_dict[fixed_label_config] = data_dict[fixed_label_config]

        current_metric_metadata = list(single_label_data_dict.keys())[0]
        current_metric_metadata_dict = literal_eval(current_metric_metadata)

        print(data_dict[current_metric_metadata].head(5))
        print(data_dict[current_metric_metadata].tail(5))
        print("Using the default label config")

        predictions_dict_prophet = predict_metrics(single_label_data_dict)
        # print(single_label_data_dict)
        predictions_dict_fourier = predict_metrics_fourier(single_label_data_dict)
    else:
        for x in data_dict:
            print(data_dict[x].head(5))
            print(data_dict[x].tail(5))
            break
        predictions_dict_prophet = predict_metrics(data_dict)
        predictions_dict_fourier = predict_metrics_fourier(data_dict)

    # TODO: Trigger data pruning here
    function_run_time = time.time() - start_time
    print("Total time taken to train was: {} seconds.".format(function_run_time))

def metrics():
    global predictions_dict_prophet, predictions_dict_fourier, current_metric_metadata, current_metric_metadata_dict, metric_name, url, token, live_data_dict

    for metadata in predictions_dict_prophet:
        # Find the index matching the current timestamp
        index_prophet = predictions_dict_prophet[metadata].index.get_loc(datetime.now(), method='nearest')
        index_fourier = predictions_dict_fourier[metadata].index.get_loc(datetime.now(), method='nearest')
        current_metric_metadata = metadata

        print("The current time is: ", datetime.now())
        print("The matching index for the Prophet model found was: \n",
              predictions_dict_prophet[metadata].iloc[[index_prophet]])
        print("The matching index for the Fourier transform found was: \n",
              predictions_dict_fourier[metadata].iloc[[index_fourier]])

        current_metric_metadata_dict = literal_eval(metadata)
        temp_current_metric_metadata_dict = current_metric_metadata_dict.copy()

        # Delete the "__name__" key from the dictionary as we don't need it in the labels
        # (it is a non-permitted label) when serving the metrics
        del temp_current_metric_metadata_dict["__name__"]

        # TODO: the following call has no error handling or retry logic in case the GET
        # request fails; that needs to be fixed.
        # Get the current metric value, which will be compared with the predicted value to detect an anomaly
        metric = Prometheus(url=url, token=token).get_current_metric_value(
            metric_name, temp_current_metric_metadata_dict)

        # Convert data to json
        metric = json.loads(metric)

        # Convert the json to a dictionary of pandas dataframes
        live_data_dict = get_df_from_single_value_json(metric, live_data_dict)

        # Trim the live data dataframe to only the 5 most recent values
        live_data_dict[metadata] = live_data_dict[metadata][-5:]

        # Update the metric values for the Prophet model
        PREDICTED_VALUES_PROPHET.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_prophet[metadata]['yhat'][index_prophet])
        PREDICTED_VALUES_PROPHET_UPPER.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_prophet[metadata]['yhat_upper'][index_prophet])
        PREDICTED_VALUES_PROPHET_LOWER.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_prophet[metadata]['yhat_lower'][index_prophet])

        # Update the metric values for the Fourier transform model
        PREDICTED_VALUES_FOURIER.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_fourier[metadata]['yhat'][index_fourier])
        PREDICTED_VALUES_FOURIER_UPPER.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_fourier[metadata]['yhat_upper'][index_fourier])
        PREDICTED_VALUES_FOURIER_LOWER.labels(**temp_current_metric_metadata_dict).set(
            predictions_dict_fourier[metadata]['yhat_lower'][index_fourier])

        # Anomaly detection is currently disabled:
        # TypeError: Invalid comparison between dtype=datetime64[ns] and int
        # if len(live_data_dict[metadata]) >= 5:
        #     # Update the metric values for detected anomalies: 1 in case of anomaly, 0 if not
        #     if detect_anomalies(predictions_dict_fourier[metadata][len(predictions_dict_fourier[metadata]) - len(live_data_dict[metadata]):], live_data_dict[metadata]):
        #         PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(1)
        #     else:
        #         PREDICTED_ANOMALY_FOURIER.labels(**temp_current_metric_metadata_dict).set(0)
        #     if detect_anomalies(predictions_dict_prophet[metadata][len(predictions_dict_prophet[metadata]) - len(live_data_dict[metadata]):], live_data_dict[metadata]):
        #         PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(1)
        #     else:
        #         PREDICTED_ANOMALY_PROPHET.labels(**temp_current_metric_metadata_dict).set(0)

    return Response(generate_latest(REGISTRY).decode("utf-8"),
                    content_type='text/plain; charset=utf-8')

# Chunk size; download the complete data, but in smaller chunks.
# Should be less than or equal to DATA_SIZE.
chunk_size = str(os.getenv('CHUNK_SIZE', '1d'))

# Net data size to scrape from Prometheus
data_size = str(os.getenv('DATA_SIZE', '1d'))

# Number of minutes the model should predict values for
# PREDICT_DURATION=1440  # minutes, 1440 = 24 hours

# Limit to the first few labels of the metric
# LABEL_LIMIT = None

# Preparing a connection to the Prometheus host
prom = Prometheus(url=url, token=token, data_chunk=chunk_size, stored_data=data_size)

# Get metric data from Prometheus
metric = prom.get_metric(metric_name)
print("metric collected.")
del prom

# Convert data to json
metric = json.loads(metric)
# print(metric)

# The metric json is converted to a shaped dataframe
pd_dict = get_df_from_json(
    metric

class TestAlarms(unittest.TestCase):
    def test_pilot(self):
        queries = [
            Query(
                "Pilot: XDS rejections",
                'pilot_total_xds_rejects',
                Alarm(lambda errors: errors > 0,
                      'There should not be any rejected XDS pushes'),
                None)
        ]
        self.run_queries(queries)

    def test_graceful_shutdown(self):
        queries = [
            *standard_queries('istio-stability-graceful-shutdown'),
            istio_requests_sanity('istio-stability-graceful-shutdown')
        ]
        self.run_queries(queries)

    def test_http_10(self):
        queries = [
            *standard_queries('istio-stability-http10'),
            istio_requests_sanity('istio-stability-http10')
        ]
        self.run_queries(queries)

    def test_mysql(self):
        queries = [
            # TODO get client-side metrics
            *standard_queries('istio-stability-mysql')
        ]
        self.run_queries(queries)

    def test_load_test(self):
        queries = [
            *standard_queries('service-graph..', cpu_lim=250, mem_lim=100)
        ]
        self.run_queries(queries)

    def test_redis(self):
        queries = [stability_query(source='redis-client', test='redis')]
        self.run_queries(queries)

    def test_rabbitmq(self):
        queries = [stability_query(source='rabbitmq-client', test='rabbitmq')]
        self.run_queries(queries)

    @classmethod
    def setUpClass(self):
        port = os.environ.get("PROM_PORT", "9990")
        namespace, deployment = find_prometheus()
        self.port_forward = subprocess.Popen([
            'kubectl', '-n', namespace, 'port-forward', deployment,
            '%s:9090' % port
        ], stdout=subprocess.PIPE)
        self.port_forward.stdout.readline()  # Wait for port forward to be ready
        self.prom = Prometheus('http://localhost:%s/' % port)

    @classmethod
    def tearDownClass(self):
        # Shut down the kubectl port-forward started in setUpClass
        self.port_forward.stdout.close()
        self.port_forward.terminate()
        self.port_forward.wait()

    def run_queries(self, queries):
        for query in queries:
            with self.subTest(name=query.description):
                if query.running_query:
                    if self.prom.fetch_value(query.running_query) == 0:
                        self.skipTest("Test is not running")
                errors = self.prom.run_query(query)
                message = 'Alarms Triggered:'
                for e in errors:
                    message += '\n- ' + e
                assert_empty(errors, message)

def envoy_cds_version_count(prom: Prometheus):
    return prom.fetch_value(
        'count(count_values("value", envoy_cluster_manager_cds_version))')

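# Sketch of how the CDS version count might be used to wait for config convergence.
# The polling interval, timeout, and the notion that "one distinct version == converged"
# are assumptions, not part of the source.
import time

def wait_for_cds_convergence(prom: Prometheus, timeout: int = 60) -> bool:
    deadline = time.time() + timeout
    while time.time() < deadline:
        if envoy_cds_version_count(prom) == 1:  # Every proxy reports the same CDS version
            return True
        time.sleep(5)
    return False
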
class Prometheus_Query:
    p = Prometheus()
    instance_name = "10.244.0.85:9308"
    oc = OC()

    def __init__(self):
        ns, ip, port = self.get_kafka_exporter_ip()
        if ip and port:
            self.instance_name = "%s:%s" % (ip, port)

    def get_kafka_exporter_ip(self):
        ns = ""
        ip = ""
        port = ""
        output = self.oc.get_services_all_namespace()
        try:
            for line in output.split("\n"):
                if line.find("my-cluster-kafka-exporter") != -1:
                    ns = line.split()[0]
                    ip = line.split()[3]
                    port = line.split()[5].split("/")[0].split(":")[0]
        except Exception as e:
            print("cannot find kafka exporter ip: %s" % str(e))
            return ns, ip, port
        print("found namespace (%s) exporter ip (%s:%s)" % (ns, ip, port))
        return ns, ip, port

    def query_lag(self):
        # cmd = 'sum(kafka_consumergroup_lag{instance="%s",topic=~"%s"}) by (consumergroup, topic)' % (self.instance_name, topic_name)
        cmd = 'sum(kafka_consumergroup_lag{topic=~"%s"})' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_avg_lag(self):
        cmd = 'avg_over_time(kafka_consumergroup_lag{topic="%s",consumergroup="%s"}[1m])' % (
            topic_name, group_name)
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset(self):
        cmd = 'sum(kafka_topic_partition_current_offset{topic=~"%s"})' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset_by_min(self):
        cmd = 'sum(delta(kafka_topic_partition_current_offset{topic=~"%s"}[3m])/3)' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_log_offset_by_sec(self):
        cmd = 'sum(rate(kafka_topic_partition_current_offset{topic=~"%s"}[1m]))' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset(self):
        cmd = 'sum(kafka_consumergroup_current_offset{topic=~"%s"})' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset_by_min(self):
        cmd = 'sum(delta(kafka_consumergroup_current_offset{topic=~"%s"}[3m])/3)' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_current_offset_by_sec(self):
        cmd = 'sum(rate(kafka_consumergroup_current_offset{topic=~"%s"}[1m]))' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_lag_by_sec(self):
        cmd = 'sum(rate(kafka_consumergroup_lag{topic=~"%s"}[1m]))' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_lag_by_min(self):
        cmd = 'sum(delta(kafka_consumergroup_lag{topic=~"%s"}[3m])/3)' % topic_name
        output = self.p.run_cmd(cmd)
        return output

    def query_pod_start_time(self, pod_name):
        cmd = 'kube_pod_start_time{pod="%s"}' % pod_name
        output = self.p.run_cmd(cmd)
        return output

    def wait_time(self, value):
        # print("wait %d seconds" % value)
        time.sleep(value)

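# Rough usage sketch; topic_name and group_name are module-level globals the class expects,
# and the values below are placeholders, not from the source.
topic_name = "my-topic"
group_name = "my-group"

pq = Prometheus_Query()
print(pq.query_lag())
print(pq.query_current_offset_by_sec())
pq.wait_time(10)
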
def observe_prom_metrics_range(self, observer, metrics_list, start_time, end_time='now', chunk_size='1h'):
    # Collect credentials to connect to a Prometheus instance
    prom_token = os.getenv("FLT_PROM_ACCESS_TOKEN")
    prom_url = os.getenv("FLT_PROM_URL")
    if not (prom_token or prom_url):
        sys.exit("Error: Prometheus credentials not found")

    prom = Prometheus(url=prom_url, token=prom_token)

    # Calculate the chunk size (in seconds) to download and push to the observer at each iteration
    chunk_seconds = int(round((dateparser.parse('now') - dateparser.parse(chunk_size)).total_seconds()))

    start = round(dateparser.parse(start_time).timestamp(), 0)
    end = round(dateparser.parse(end_time).timestamp(), 0)
    _LOGGER.info("Collecting metric data within datetime range:{0} - {1}".format(
        dateparser.parse(str(start)), dateparser.parse(str(end))))

    current_latest_timestamp = 0
    while start < end:  # Main loop which iterates through time ranges, collecting one chunk of data per iteration
        chunk_end_time = start + chunk_seconds - 1  # Increment the metric chunk time to collect the next chunk
        if (start + chunk_seconds) >= end:
            # When the specified start-end datetime range is not divisible by the specified chunk time,
            # reduce the size of the last chunk to fit the specified datetime frame
            chunk_end_time = end

        for metric_name in metrics_list:  # Loop to get a chunk of data for every metric in the list
            _LOGGER.info("Current Chunk Info: Metric = {0}, Time range = {1} - {2}".format(
                metric_name, dateparser.parse(str(start)), dateparser.parse(str(chunk_end_time))))
            pkt_list = prom.get_metric_range_data(metric_name=metric_name,
                                                  start_time=start,
                                                  end_time=chunk_end_time)
            _LOGGER.info("Collected {0} packets.".format(len(pkt_list)))

            # pkt_list contains data for multiple metric series, each of which is pushed to the observer.
            for pkt in pkt_list:
                # print(dateparser.parse(str(pkt['values'][0][0])), "-", dateparser.parse(str(pkt['values'][-1][0])))
                if pkt['values'][-1][0] > current_latest_timestamp:
                    current_latest_timestamp = pkt['values'][-1][0]
                try:
                    observer.on_next(pkt)
                except Exception as e:
                    _LOGGER.error("{0}, while processing the following metric packet: \n{1}".format(
                        str(e), str(pkt)))  # Check which pkt caused the exception
                    raise e

            self.final_packet_timestamp[metric_name] = current_latest_timestamp
        start += chunk_seconds