def test_create_retention_policy(self):
    """Test create_retention_policy for DataFrameClient object."""
    cli = DataFrameClient(database='db')
    example_response = '{"results":[{}]}'

    with requests_mock.Mocker() as m:
        m.register_uri(
            requests_mock.POST,
            "http://localhost:8086/query",
            text=example_response
        )
        cli.create_retention_policy(
            'somename', '1d', 4, database='db'
        )
        self.assertEqual(
            m.last_request.qs['q'][0],
            'create retention policy "somename" on '
            '"db" duration 1d replication 4 shard duration 0s'
        )
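# A minimal sketch of the harness the test method above assumes: a
# unittest.TestCase subclass (the class name here is hypothetical) so that
# self.assertEqual and test discovery work. requests_mock intercepts the
# HTTP call, so no real InfluxDB server is needed.
import unittest

import requests_mock
from influxdb import DataFrameClient


class TestDataFrameClient(unittest.TestCase):  # hypothetical class name
    # ... test_create_retention_policy from above goes here ...
    pass


if __name__ == '__main__':
    unittest.main()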
import logging

from influxdb import DataFrameClient

logger = logging.getLogger(__name__)


class tsdb(object):
    def __init__(self, dbname, host='localhost', port=8086,
                 user='******', password='******'):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.dbname = dbname
        self.client = None
        self.protocol = 'json'

    def _connect(self):
        if self.client is None:
            self.client = DataFrameClient(host=self.host, port=self.port,
                                          username=self.user,
                                          password=self.password,
                                          database=self.dbname)
            # self.client.switch_database(self.dbname)

    def _disconnect(self):
        if self.client is not None:
            self.client.close()
            self.client = None

    def _reconnect(self):
        self._disconnect()
        self._connect()

    def create_db(self):
        self._connect()
        dbs = self.client.get_list_database()
        for e in dbs:
            if self.dbname in e.values():
                logger.debug("Database {} already exists.".format(
                    self.dbname))
                return
        logger.info("Creating database: {}".format(self.dbname))
        self.client.create_database(self.dbname)
        # self._set_retention_policy()

    def _set_retention_policy(self):
        self._connect()
        self.client.create_retention_policy(name='raw', duration='12h',
                                            replication=1, default=True)
        self.client.create_retention_policy(name='cooked', duration='52w',
                                            replication=1, default=False)

    def check_db(self):
        self._connect()
        db = self.client.get_list_database()
        ms = self.client.get_list_measurements()
        rp = self.client.get_list_retention_policies(self.dbname)
        user = self.client.get_list_users()
        print('db: {}, measurements: {}'.format(db, ms))
        print('retention policy: {}'.format(rp))
        print('users: {}'.format(user))

    def insert(self, df, measurement, tags=None):
        self._connect()
        try:
            result = self.client.write_points(df, measurement, tags=tags,
                                              time_precision='n',
                                              protocol=self.protocol)
        except Exception:
            logger.exception('influxdb write error')
            result = False
        return result

    def query(self, sql):
        self._connect()
        result = self.client.query(sql)
        return result
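# A minimal usage sketch for the tsdb wrapper above. The database name,
# credentials, measurement, and tag values are placeholders, not taken from
# the original; only documented DataFrameClient behavior is relied on.
if __name__ == '__main__':
    import pandas as pd

    db = tsdb('example_db', user='admin', password='secret')  # placeholders
    db.create_db()
    # DataFrameClient expects a datetime-indexed DataFrame to write.
    frame = pd.DataFrame(
        {'value': [0.5, 0.7]},
        index=pd.date_range('2021-01-01', periods=2, freq='s', tz='UTC'))
    db.insert(frame, 'cpu_load', tags={'host': 'node1'})
    print(db.query('SELECT * FROM "cpu_load"'))
    db.check_db()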
# Assumed module-level context for this excerpt: json, logging, a ConfigParser
# named "config" (distinct from the JSON config loaded into self.config),
# SAMPLE_INTERVAL, WindowState, MetricsResults, and DataFrameClient from the
# influxdb package.
class MetricsConsumer(object):
    def __init__(self, app_slo, app_config_file, config_file, app_id):
        with open(config_file) as json_data:
            self.config = json.load(json_data)
        with open(app_config_file) as json_data:
            self.app_metric_config = json.load(json_data)
        if "metric" in app_slo:
            self.app_metric_config["metric"] = app_slo["metric"]
        if "threshold" in app_slo:
            self.app_metric_config["threshold"] = app_slo["threshold"]
        self.group_keys = {"intel/docker": "io.kubernetes.pod.name"}
        self.default_group_key = "nodename"
        influx_host = config.get("INFLUXDB", "HOST")
        influx_port = config.get("INFLUXDB", "PORT")
        influx_user = config.get("INFLUXDB", "USERNAME")
        influx_password = config.get("INFLUXDB", "PASSWORD")
        self.influx_client = DataFrameClient(
            influx_host, influx_port, influx_user, influx_password,
            config.get("INFLUXDB", "RAW_DB_NAME"))
        self.app_influx_client = DataFrameClient(
            influx_host, influx_port, influx_user, influx_password,
            config.get("INFLUXDB", "APP_DB_NAME"))
        self.deployment_id = ''
        derived_db = config.get("INFLUXDB", "DERIVED_METRIC_DB_NAME")
        self.derived_influx_client = DataFrameClient(influx_host, influx_port,
                                                     influx_user,
                                                     influx_password,
                                                     derived_db)
        self.derived_influx_client.create_database(derived_db)
        self.derived_influx_client.create_retention_policy(
            'derived_metric_policy', '5w', 1, default=True)
        self.logger = logging.getLogger(app_id)

    def get_app_threshold(self):
        threshold_config = self.app_metric_config["threshold"]
        value = float(threshold_config["value"])
        unit = threshold_config["unit"]
        # The app metric is measured in seconds, so convert ms thresholds.
        if unit == "ms":
            value = value / 1000.
        return value

    def get_app_metric(self, start_time, end_time, is_derived=True):
        self.logger.info(
            "Start processing app metric with app_metric_config %s" %
            self.app_metric_config)
        metric_name = self.app_metric_config["metric"]["name"]
        aggregation = self.app_metric_config["analysis"]["aggregation"]
        self.incident_type = self.app_metric_config["type"]
        self.incident_metric = self.app_metric_config["metric"]
        self.incident_threshold = self.app_metric_config["threshold"]
        time_filter = "WHERE time >= %d AND time <= %d" % (start_time,
                                                           end_time)
        app_metric_query = ("SELECT time, %s(value) as value FROM \"%s\" %s" %
                            (aggregation, metric_name, time_filter))
        if "tags" in self.app_metric_config["metric"]:
            tags = self.app_metric_config["metric"]["tags"]
            tags_filter = " AND ".join(
                ["\"%s\"='%s'" % (tag["key"], tag["value"]) for tag in tags])
            app_metric_query += (" AND %s" % (tags_filter))
        app_metric_query += (" GROUP BY time(%ds) fill(none)" %
                             (SAMPLE_INTERVAL))
        self.logger.debug("app_metric_query = %s" % (app_metric_query))
        df = self.app_influx_client.query(app_metric_query)
        self.logger.debug("App metric query completed")
        if metric_name not in df:
            return None
        if not is_derived:
            return df[metric_name]
        if df[metric_name] is not None:
            slo_state = WindowState(
                self.app_metric_config["analysis"]["observation_window_sec"],
                self.get_app_threshold(),
                SAMPLE_INTERVAL,
            )
            df[metric_name]["value"] = df[metric_name].apply(
                lambda row: slo_state.compute_derived_value(
                    row.name.value, row.value),
                axis=1,
            )
        return df[metric_name]

    def get_raw_metrics(self, start_time, end_time, app_metric=None):
        metrics_result = MetricsResults(self.derived_influx_client,
                                        is_derived=False)
        if app_metric is not None:
            metrics_result.set_app_metric(
                app_metric, self.app_metric_config["metric"]["name"])
        time_filter = "WHERE time >= %d AND time <= %d" % (start_time,
                                                           end_time)
        for metric_config in self.config:
            metric_source = str(metric_config["metric_name"])
            group_name = self.default_group_key
            is_container_metric = False
            for k, v in self.group_keys.items():
                if k in metric_source:
                    group_name = v
                    is_container_metric = True
                    break
            if metric_source.startswith("/"):
                metric_source = metric_source[1:]
            # construct tags_filter if needed for this metric
            tags_filter = None
            if "tags" in metric_config:
                tags = metric_config["tags"]
                tags_filter = " AND ".join(
                    ["\"%s\"='%s'" % (k, v) for k, v in tags.items()])
            raw_metrics_query = ("SELECT * FROM \"%s\" %s" %
                                 (metric_source, time_filter))
            if tags_filter:
                raw_metrics_query += (" AND %s" % (tags_filter))
            raw_metrics = self.influx_client.query(raw_metrics_query)
            if len(raw_metrics) == 0:
                self.logger.info(
                    "Unable to find data for %s, skipping this metric..." %
                    (metric_source))
                continue
            for metric_group_name in raw_metrics[metric_source][
                    group_name].unique():
                if not metric_group_name or metric_group_name == "":
                    self.logger.info("Unable to find %s in metric %s" %
                                     (metric_group_name, metric_source))
                    continue
            df = raw_metrics[metric_source]
            dfg = df.groupby(group_name)
            metrics_result.add_metric(
                metric_source, metric_source, is_container_metric, dfg,
                metric_config["resource"],
                metric_config["analysis"]["observation_window_sec"],
                metric_config["threshold"]["value"],
                metric_config["threshold"]["type"],
                metric_config["threshold"]["unit"])
        return metrics_result

    def get_derived_metrics(self, start_time, end_time, app_metric=None):
        derived_metrics_result = MetricsResults(self.derived_influx_client,
                                                is_derived=True)
        if app_metric is not None:
            derived_metrics_result.set_app_metric(
                app_metric, self.app_metric_config["metric"]["name"] + "/" +
                self.app_metric_config["type"])
        node_metric_keys = "value,nodename,deploymentId"
        container_metric_keys = (
            "value,\"io.kubernetes.pod.name\",nodename,deploymentId")
        time_filter = "WHERE time > %d AND time <= %d" % (start_time,
                                                          end_time)
        self.logger.info("Start processing infrastructure metrics")
        for metric_config in self.config:
            metric_source = str(metric_config["metric_name"])
            group_name = self.default_group_key
            is_container_metric = False
            for k, v in self.group_keys.items():
                if k in metric_source:
                    group_name = v
                    is_container_metric = True
                    break
            metric_type = metric_config["type"]
            new_metric_name = metric_source + "/" + metric_type
            if metric_source.startswith("/"):
                metric_source = metric_source[1:]
            # construct tags_filter if needed for this metric
            tags_filter = None
            if "tags" in metric_config:
                tags = metric_config["tags"]
                tags_filter = " AND ".join(
                    ["\"%s\"='%s'" % (k, v) for k, v in tags.items()])
            # fetch raw metric values from influxdb
            if is_container_metric:
                raw_metrics_query = (
                    "SELECT %s FROM \"%s\" %s" %
                    (container_metric_keys, metric_source, time_filter))
            else:
                raw_metrics_query = (
                    "SELECT %s FROM \"%s\" %s" %
                    (node_metric_keys, metric_source, time_filter))
            if tags_filter:
                raw_metrics_query += (" AND %s" % (tags_filter))
            self.logger.debug("raw metrics for derived metrics query = %s" %
                              (raw_metrics_query))
            raw_metrics = self.influx_client.query(raw_metrics_query)
            self.logger.debug("raw metrics query completed")
            if len(raw_metrics) == 0:
                self.logger.info(
                    "Unable to find data for %s; skipping this metric..." %
                    (metric_source))
                continue
            metric_df = raw_metrics[metric_source]
            metric_group_states = {}
            self.deployment_id = metric_df.loc[:, "deploymentId"][0]
            # fetch normalizer metric values if normalization is needed
            normalizer_metrics = None
            normalizer_df = None
            if "normalizer" in metric_config:
                normalizer = str(metric_config["normalizer"])
                new_metric_name = metric_source + "_normalized/" + metric_type
                if normalizer.startswith("/"):
                    normalizer = normalizer[1:]
                if is_container_metric:
                    normalizer_metrics_query = (
                        "SELECT %s FROM \"%s\" %s" %
                        (container_metric_keys, normalizer, time_filter))
                else:
                    normalizer_metrics_query = (
                        "SELECT %s FROM \"%s\" %s" %
                        (node_metric_keys, normalizer, time_filter))
                if tags_filter:
                    normalizer_metrics_query += (" AND %s" % (tags_filter))
                self.logger.debug("normalizer metrics query = %s" %
                                  (normalizer_metrics_query))
                normalizer_metrics = self.influx_client.query(
                    normalizer_metrics_query)
                if len(normalizer_metrics) == 0:
                    self.logger.info(
                        "Unable to find data for normalizer %s; "
                        "skipping metric %s..." % (normalizer, metric_source))
                    continue
                normalizer_df = normalizer_metrics[normalizer]
                if normalizer_df["value"].max() == 0:
                    self.logger.info(
                        "All zero values in normalizer %s, "
                        "skipping metric %s..." % (normalizer, metric_source))
                    continue
            self.logger.debug(
                "Converting raw metric %s\n into derived metric %s" %
                (metric_source, new_metric_name))
            # process metric values for each group:
            # metric_group_name = nodename for node metrics,
            # pod name for container metrics
            for metric_group_name in raw_metrics[metric_source][
                    group_name].unique():
                if not metric_group_name or metric_group_name == "":
                    self.logger.info("Unable to find %s in metric %s" %
                                     (metric_group_name, metric_source))
                    continue
                if metric_group_name not in metric_group_states:
                    new_state = WindowState(
                        metric_config["observation_window_sec"],
                        metric_config["threshold"]["value"],
                        SAMPLE_INTERVAL,
                    )
                    metric_group_states[metric_group_name] = new_state
                metric_group_ind = metric_df.loc[metric_df[group_name] ==
                                                 metric_group_name].index
                # perform normalization if needed for raw metrics in each group
                if normalizer_metrics:
                    normalizer_group_ind = normalizer_df.loc[
                        normalizer_df[group_name] == metric_group_name].index
                    if normalizer_df.loc[normalizer_group_ind,
                                         "value"].max() == 0:
                        self.logger.debug(
                            "Normalizer metric has all zeros for group %s; " %
                            (metric_group_name) +
                            "dropping this group from the raw metric...")
                        metric_df = metric_df.drop(metric_group_ind)
                        continue
                    if len(normalizer_group_ind) != len(metric_group_ind):
                        self.logger.warning(
                            "Normalizer does not have equal length as raw "
                            "metric; adjusting...")
                        minlen = min(len(metric_group_ind),
                                     len(normalizer_group_ind))
                        metric_group_ind = metric_group_ind[:minlen]
                        normalizer_group_ind = normalizer_group_ind[:minlen]
                    # use .values to divide elementwise without index alignment
                    metric_df.loc[metric_group_ind, "value"] = (
                        metric_df.loc[metric_group_ind, "value"] /
                        normalizer_df.loc[normalizer_group_ind,
                                          "value"].values)
                # compute derived metric values using configured threshold info
                self.logger.debug(
                    "raw metric before applying threshold for group %s" %
                    (metric_group_name))
                self.logger.debug(
                    metric_df.loc[metric_group_ind,
                                  [group_name, "value"]].to_string(
                                      index=False))
                metric_df.loc[
                    metric_group_ind,
                    "value"] = metric_df.loc[metric_group_ind].apply(
                        lambda row: metric_group_states[row[group_name]].
                        compute_derived_value(row.name.value, row.value),
                        axis=1,
                    )
                self.logger.debug(
                    "derived metric after applying threshold for group %s" %
                    (metric_group_name))
                self.logger.debug(
                    metric_df.loc[metric_group_ind,
                                  [group_name, "value"]].to_string(
                                      index=False))
            metric_dfg = metric_df.groupby(group_name)
            derived_metrics_result.add_metric(
                metric_source, new_metric_name, is_container_metric,
                metric_dfg, metric_config["resource"],
                metric_config["observation_window_sec"],
                metric_config["threshold"]["value"],
                metric_config["threshold"]["type"],
                metric_config["threshold"]["unit"])
        return derived_metrics_result
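# A hedged usage sketch for MetricsConsumer. The file names, SLO payload, and
# app id below are illustrative, not from the original; the queries above
# compare raw integers against "time", which InfluxDB interprets as
# nanosecond epochs.
if __name__ == '__main__':
    import time

    slo = {"threshold": {"value": 500, "unit": "ms"}}  # example SLO payload
    consumer = MetricsConsumer(slo, "app_config.json",
                               "metrics_config.json", "my-app")
    end_ns = int(time.time() * 1e9)
    start_ns = end_ns - 300 * 10**9  # last five minutes
    app_df = consumer.get_app_metric(start_ns, end_ns, is_derived=True)
    derived = consumer.get_derived_metrics(start_ns, end_ns,
                                           app_metric=app_df)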
# Assumed module-level context for this excerpt: ast, math, pandas as pd,
# DataFrameClient from the influxdb package, the project's get_logger,
# JobStatus, and Status helpers, and a MongoDB database handle named
# "resultdb".
class SizingAnalyzer(object):
    def __init__(self, config):
        self.config = config
        self.logger = get_logger(__name__,
                                 log_level=("UTILIZATION", "LOGLEVEL"))
        self.percentiles = ast.literal_eval(
            config.get("UTILIZATION", "PERCENTILES"))
        self.stat_type = config.get("UTILIZATION", "DEFAULT_STAT_TYPE")
        self.scaling_factor = float(
            self.config.get("UTILIZATION", "DEFAULT_SCALING_FACTOR"))
        influx_host = config.get("INFLUXDB", "HOST")
        influx_port = config.get("INFLUXDB", "PORT")
        influx_user = config.get("INFLUXDB", "USERNAME")
        influx_password = config.get("INFLUXDB", "PASSWORD")
        input_db = config.get("INFLUXDB", "SIZING_INPUT_DB_NAME")
        output_db = config.get("INFLUXDB", "SIZING_OUTPUT_DB_NAME")
        # "resultdb" is expected to be a module-level MongoDB database handle
        self.resultdb = resultdb
        self.results_collection = config.get("UTILIZATION",
                                             "SIZING_RESULTS_COLLECTION")
        self.influx_client_input = DataFrameClient(influx_host, influx_port,
                                                   influx_user,
                                                   influx_password, input_db)
        self.influx_client_output = DataFrameClient(influx_host, influx_port,
                                                    influx_user,
                                                    influx_password,
                                                    output_db)
        self.influx_client_output.create_database(output_db)
        self.influx_client_output.create_retention_policy(
            'sizing_result_policy', '4w', 1, default=True)

    def analyze_node_cpu(self, start_time, end_time, stat_type=None,
                         scaling_factor=None):
        if stat_type is not None:
            self.stat_type = stat_type
        if scaling_factor is not None:
            self.scaling_factor = scaling_factor
        self.base_metric = 'usage'

        self.logger.info(
            "-- [node_cpu] Query influxdb for raw metrics data --")
        output_filter = "derivative(sum(value), 1s) as usage"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND mode=~ /(user|system)/"
        group_tags = self.config.get("UTILIZATION", "NODE_GROUP_TAGS")
        group_by_tags = group_tags + ",time(1ms)"
        metric_name = "node_cpu"
        try:
            node_cpu_usage_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [node_cpu] Compute summary stats --")
        node_cpu_summary = pd.DataFrame()
        new_metric = "node_cpu_usage"
        for k, df in node_cpu_usage_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            # df.drop(df.index[(df.usage > MAX_CORES) | (df.usage < 0)],
            #         inplace=True)
            try:
                self.influx_client_output.write_points(df, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_cpu_summary[node_key, self.base_metric] = df.usage.describe(
                self.percentiles).drop(['count', 'std', 'min'])
        node_cpu_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        }, inplace=True)
        self.logger.debug("Computed node cpu usage summary:\n %s" %
                          node_cpu_summary.to_json())

        self.logger.info(
            "-- [node_cpu] Query influxdb for current node configs --")
        metric_name = "machine_cpu_cores"
        try:
            node_cpu_size_dict = self.query_influx_metric(
                metric_name, "value", time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        current_cpu_sizes = self.get_node_sizes(node_cpu_size_dict,
                                                "node_cpu_size")
        self.logger.info("Current node cpu sizes (in #cores):\n %s" %
                         current_cpu_sizes)

        self.logger.info("-- [node_cpu] Compute sizing recommendation --")
        recommended_cpu_sizes = self.recommend_node_sizes(node_cpu_summary)
        self.logger.info("Recommended node cpu sizes (in #cores):\n %s" %
                         recommended_cpu_sizes)

        self.logger.info("-- [node_cpu] Store analysis results in mongodb --")
        results = self.construct_analysis_results("cpu", node_cpu_summary,
                                                  recommended_cpu_sizes,
                                                  current_cpu_sizes)
        sizing_result_doc = {
            "object_type": "node",
            "resource": "cpu",
            "unit": "cores",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_node_memory(self, start_time, end_time, stat_type=None,
                            scaling_factor=None, base_metric=None):
        if stat_type is not None:
            self.stat_type = stat_type
        if scaling_factor is not None:
            self.scaling_factor = scaling_factor
        if base_metric is not None:
            self.base_metric = base_metric
        else:
            self.base_metric = self.config.get("UTILIZATION",
                                               "MEMORY_BASE_METRIC")

        self.logger.info(
            "-- [node_memory] Query influxdb for raw metrics data --")
        output_filter = "value/1024/1024/1024"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        group_tags = self.config.get("UTILIZATION", "NODE_GROUP_TAGS")
        metric_name_active = "node_memory_Active"
        try:
            node_mem_active_dict = self.query_influx_metric(
                metric_name_active, output_filter, time_filter, "",
                group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        metric_name_total = "node_memory_MemTotal"
        try:
            node_mem_total_dict = self.query_influx_metric(
                metric_name_total, output_filter, time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        metric_name_free = "node_memory_MemFree"
        try:
            node_mem_free_dict = self.query_influx_metric(
                metric_name_free, output_filter, time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [node_memory] Compute summary stats --")
        node_mem_summary = pd.DataFrame()
        new_metric = "node_memory_active"
        for k, df_active in node_mem_active_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            try:
                self.influx_client_output.write_points(df_active, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_mem_summary[node_key, 'active'] = df_active.value.describe(
                self.percentiles).drop(['count', 'std', 'min'])

        new_metric = "node_memory_usage"
        for k in node_mem_total_dict.keys():
            group_key = dict((x, y) for x, y in k[1])
            df_total = node_mem_total_dict[k]
            k_free = (metric_name_free, k[1])
            df_free = node_mem_free_dict[k_free]
            df_usage = df_total - df_free
            try:
                self.influx_client_output.write_points(df_usage, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_mem_summary[node_key, 'usage'] = df_usage.value.describe(
                self.percentiles).drop(['count', 'std', 'min'])
        node_mem_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        }, inplace=True)
        self.logger.debug("Computed node memory usage summary:\n %s" %
                          node_mem_summary.to_json())

        self.logger.info(
            "-- [node_memory] Query influxdb for current node configs --")
        try:
            node_mem_size_dict = self.query_influx_metric(
                "machine_memory_bytes", output_filter, time_filter, "",
                group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        current_mem_sizes = self.get_node_sizes(node_mem_size_dict,
                                                "node_memory_size")
        self.logger.info("Current node memory sizes (in GB):\n %s" %
                         current_mem_sizes)

        self.logger.info("-- [node_memory] Compute sizing recommendation --")
        recommended_mem_sizes = self.recommend_node_sizes(node_mem_summary)
        self.logger.info("Recommended node memory sizes (in GB):\n %s" %
                         recommended_mem_sizes)

        self.logger.info(
            "-- [node_memory] Store analysis results in mongodb --")
        results = self.construct_analysis_results("memory", node_mem_summary,
                                                  recommended_mem_sizes,
                                                  current_mem_sizes)
        sizing_result_doc = {
            "object_type": "node",
            "resource": "memory",
            "unit": "GB",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_container_cpu(self, start_time, end_time, stat_type=None,
                              scaling_factor=None):
        if stat_type is not None:
            self.stat_type = stat_type
        if scaling_factor is not None:
            self.scaling_factor = scaling_factor
        self.base_metric = 'usage'

        self.logger.info(
            "-- [container_cpu] Query influxdb for raw metrics data --")
        output_filter = "derivative(sum(value), 1s) as usage"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND image!=''"
        group_tags = self.config.get("UTILIZATION", "CONTAINER_GROUP_TAGS")
        group_by_tags = group_tags + ",pod_name,time(1ms)"
        # The user-time metric name was masked in the source; the name below
        # is inferred to mirror the system-time metric queried next.
        metric_name_usr = "container_cpu_user_seconds_total"
        try:
            container_cpu_user_dict = self.query_influx_metric(
                metric_name_usr, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        metric_name_sys = "container_cpu_system_seconds_total"
        try:
            container_cpu_sys_dict = self.query_influx_metric(
                metric_name_sys, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [container_cpu] Compute summary stats --")
        df_usage = pd.DataFrame()
        container_cpu_usage_dict = {}
        for k_user, df_user in container_cpu_user_dict.items():
            group_key = k_user[1]
            df_sys = container_cpu_sys_dict[(metric_name_sys, group_key)]
            df_usage = (df_user + df_sys).astype('float32')
            if group_key not in container_cpu_usage_dict.keys():
                container_cpu_usage_dict[group_key] = df_usage
            else:
                df_comb = pd.merge_asof(container_cpu_usage_dict[group_key],
                                        df_usage,
                                        left_index=True,
                                        right_index=True,
                                        suffixes=('_1', '_2'),
                                        direction='nearest')
                container_cpu_usage_dict[group_key].usage = df_comb[[
                    'usage_1', 'usage_2'
                ]].max(axis=1)

        container_cpu_summary = pd.DataFrame()
        output_metric = "container_cpu_usage"
        for k, df_usage in container_cpu_usage_dict.items():
            group_key = dict((x, y) for x, y in k)
            df_usage = df_usage.dropna()
            try:
                self.influx_client_output.write_points(df_usage,
                                                       output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_cpu_summary[image_key,
                                  self.base_metric] = df_usage.usage.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])
        container_cpu_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        }, inplace=True)
        self.logger.debug("Computed container cpu usage summary:\n %s" %
                          container_cpu_summary.to_json())

        self.logger.info(
            "-- [container_cpu] Query influxdb for current requests "
            "and limits --")
        output_filter = "sum(value) as value"
        group_by_tags = group_tags + ",pod_name,time(5s)"
        try:
            cpu_quota_dict = self.query_influx_metric(
                "container_spec_cpu_quota", output_filter, time_filter,
                tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        try:
            cpu_period_dict = self.query_influx_metric(
                "container_spec_cpu_period", output_filter, time_filter,
                tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        output_metric = "container_cpu_settings"
        try:
            current_cpu_settings = self.get_container_cpu_settings(
                cpu_quota_dict, cpu_period_dict, output_metric)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        self.logger.info("Current container cpu settings (in #cores):\n %s" %
                         current_cpu_settings)

        self.logger.info("-- [container_cpu] Compute requests and limits --")
        recommended_cpu_settings = self.recommend_container_cpu_settings(
            container_cpu_summary)
        self.logger.info(
            "Recommended container cpu settings (in #cores):\n %s" %
            recommended_cpu_settings)

        self.logger.info(
            "-- [container_cpu] Store analysis results in mongodb --")
        results = self.construct_analysis_results("cpu",
                                                  container_cpu_summary,
                                                  recommended_cpu_settings,
                                                  current_cpu_settings)
        sizing_result_doc = {
            "object_type": "container",
            "resource": "cpu",
            "unit": "cores",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_container_memory(self, start_time, end_time, stat_type=None,
                                 scaling_factor=None, base_metric=None):
        if stat_type is not None:
            self.stat_type = stat_type
        if scaling_factor is not None:
            self.scaling_factor = scaling_factor
        if base_metric is not None:
            self.base_metric = base_metric
        else:
            self.base_metric = self.config.get("UTILIZATION",
                                               "MEMORY_BASE_METRIC")

        self.logger.info(
            "-- [container_memory] Query influxdb for raw metrics data --")
        output_filter = "max(value)/1024/1024 as value"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND image!=''"
        group_tags = self.config.get("UTILIZATION", "CONTAINER_GROUP_TAGS")
        group_by_tags = group_tags + ",time(5s)"
        metric_name = "container_memory_working_set_bytes"
        try:
            container_mem_active_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        metric_name = "container_memory_usage_bytes"
        try:
            container_mem_usage_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [container_memory] Compute summary stats --")
        container_mem_summary = pd.DataFrame()
        output_metric = "container_memory_active"
        for k, df_active in container_mem_active_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            df_active = df_active.dropna()
            try:
                self.influx_client_output.write_points(df_active,
                                                       output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_mem_summary[image_key,
                                  'active'] = df_active.value.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])

        output_metric = "container_memory_usage"
        for k, df_usage in container_mem_usage_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            df_usage = df_usage.dropna()
            try:
                self.influx_client_output.write_points(df_usage,
                                                       output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_mem_summary[image_key,
                                  'usage'] = df_usage.value.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])
        container_mem_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        }, inplace=True)
        self.logger.debug("Computed container memory usage summary:\n %s" %
                          container_mem_summary.to_json())

        self.logger.info(
            "-- [container_memory] Query influxdb for current requests "
            "and limits --")
        try:
            mem_settings_dict = self.query_influx_metric(
                "container_spec_memory_limit_bytes", output_filter,
                time_filter, tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        output_metric = "container_mem_settings"
        try:
            current_mem_settings = self.get_container_mem_settings(
                mem_settings_dict, output_metric)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        self.logger.info("Current container memory settings (in MB):\n %s" %
                         current_mem_settings)

        self.logger.info(
            "-- [container_memory] Compute requests and limits --")
        recommended_mem_settings = self.recommend_container_mem_settings(
            container_mem_summary)
        self.logger.info(
            "Recommended container memory settings (in MB):\n %s" %
            recommended_mem_settings)

        self.logger.info(
            "-- [container_memory] Store analysis results in mongodb --")
        results = self.construct_analysis_results("memory",
                                                  container_mem_summary,
                                                  recommended_mem_settings,
                                                  current_mem_settings)
        sizing_result_doc = {
            "object_type": "container",
            "resource": "memory",
            "unit": "MB",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},  # filled in by store_analysis_results below
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def query_influx_metric(self, metric_name, output_filter, time_filter,
                            tags_filter, group_by_tags):
        try:
            influx_query = "SELECT %s FROM %s WHERE %s %s GROUP BY %s" % \
                (output_filter, metric_name, time_filter, tags_filter,
                 group_by_tags)
            self.logger.debug("Running influxDB read query: %s" %
                              influx_query)
            metric_dict = self.influx_client_input.query(influx_query)
        except Exception as e:
            err_msg = "Unable to fetch %s from influxDB: %s" % (metric_name,
                                                                str(e))
            self.logger.error(err_msg)
            raise Exception(err_msg)
        return metric_dict

    def get_node_sizes(self, node_size_dict, output_name):
        current_sizes = {}
        for k, df in node_size_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            # label_key = tuple(x+"="+y for x, y in k[1])
            try:
                self.influx_client_output.write_points(df, output_name,
                                                       group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)
            node_key = "instance=" + group_key['instance']
            current_sizes[node_key] = {
                'size': math.ceil(df.loc[df.index[len(df) - 1], 'value'])
            }
        return current_sizes

    def get_container_cpu_settings(self, cpu_quota_dict, cpu_period_dict,
                                   output_name):
        current_settings = {}
        metric_name_period = list(cpu_period_dict.keys())[0][0]
        container_cpu_limit_dict = {}
        for k_quota, df_quota in cpu_quota_dict.items():
            group_key = k_quota[1]
            df_period = cpu_period_dict[(metric_name_period, group_key)]
            df_limit = df_quota.divide(df_period).dropna()
            if group_key not in container_cpu_limit_dict.keys():
                container_cpu_limit_dict[group_key] = df_limit
            else:
                df_comb = pd.merge_asof(container_cpu_limit_dict[group_key],
                                        df_limit,
                                        left_index=True,
                                        right_index=True,
                                        suffixes=('_1', '_2'),
                                        direction='nearest')
                container_cpu_limit_dict[group_key] = df_comb[[
                    'value_1', 'value_2'
                ]].min(axis=1)
        for k, df in container_cpu_limit_dict.items():
            group_key = dict((x, y) for x, y in k)
            try:
                self.influx_client_output.write_points(df, output_name,
                                                       group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)
            container_key = "image=" + group_key['image']
            current_settings[container_key] = {
                'requests': 0,
                'limits':
                math.ceil(df.loc[df.index[len(df) - 1], 'value'] * 100) /
                float(100)
            }
        return current_settings

    def get_container_mem_settings(self, mem_settings_dict, output_name):
        current_settings = {}
        for k, df in mem_settings_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            try:
                self.influx_client_output.write_points(df.dropna(),
                                                       output_name, group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)
            container_key = "image=" + group_key['image']
            current_settings[container_key] = {
                'requests': 0,
                'limits': math.ceil(df.loc[df.index[len(df) - 1], 'value'])
            }
        return current_settings

    def recommend_node_sizes(self, node_summary):
        node_sizes = {}
        for column in node_summary:
            col_keys = node_summary[column].name
            group_key = col_keys[0]
            metric_type = col_keys[1]
            if metric_type == self.base_metric:
                node_sizes[group_key] = {
                    'size':
                    math.ceil(node_summary[column][self.stat_type] *
                              self.scaling_factor)
                }
        return node_sizes

    def recommend_container_cpu_settings(self, container_cpu_summary):
        container_cpu_settings = {}
        for column in container_cpu_summary:
            col_keys = container_cpu_summary[column].name
            group_key = col_keys[0]
            limit_value = max(
                float(self.config.get("UTILIZATION", "MIN_CPU_LIMITS")),
                math.ceil(container_cpu_summary[column][self.stat_type] *
                          self.scaling_factor * 100) / 100.0)
            container_cpu_settings[group_key] = {
                "requests":
                math.ceil(container_cpu_summary[column][self.stat_type] *
                          100) / float(100),
                "limits": limit_value
            }
        return container_cpu_settings

    def recommend_container_mem_settings(self, container_mem_summary):
        container_mem_settings = {}
        for column in container_mem_summary:
            col_keys = container_mem_summary[column].name
            group_key = col_keys[0]
            if group_key not in container_mem_settings:
                container_mem_settings[group_key] = {
                    'requests': 0,
                    'limits': 0
                }
            metric_type = col_keys[1]
            if metric_type == 'active':
                requests = math.ceil(
                    container_mem_summary[column][self.stat_type])
                container_mem_settings[group_key]['requests'] = requests
            else:  # metric_type == 'usage'
                limits = math.ceil(
                    container_mem_summary[column][self.stat_type] *
                    self.scaling_factor)
                container_mem_settings[group_key]['limits'] = limits
        return container_mem_settings

    def construct_analysis_results(self, resource_type, usage_summary,
                                   recommended_settings, current_settings):
        summary_stats = {}
        results = []
        for column in usage_summary:
            col_keys = usage_summary[column].name
            label_key = col_keys[0]
            metric_type = col_keys[1]
            summary_type = resource_type + "_" + metric_type
            if label_key not in summary_stats:
                summary_stats[label_key] = {
                    summary_type: usage_summary[column].to_dict()
                }
            else:
                summary_stats[label_key][summary_type] = usage_summary[
                    column].to_dict()
        for label_key in summary_stats:
            label_values = {}
            for label in label_key.split(","):
                label_pair = label.split("=")
                label_values[label_pair[0]] = label_pair[1]
            if label_key not in current_settings.keys():
                current_settings[label_key] = {'requests': 0, 'limits': 0}
            results.append({
                "label_values": label_values,
                "summary_stats": summary_stats[label_key],
                "current_settings": current_settings[label_key],
                "recommended_settings": recommended_settings[label_key]
            })
        return results

    def store_analysis_results(self, sizing_result_doc):
        sizing_result_doc["config"] = {
            "stat_type": self.stat_type,
            "scaling_factor": self.scaling_factor,
            "base_metric": self.base_metric
        }
        try:
            self.resultdb[self.results_collection].insert_one(
                sizing_result_doc)
        except Exception as e:
            self.logger.error(
                "Unable to store sizing result doc in MongoDB: %s" %
                str(sizing_result_doc))
            raise Exception("Unable to store sizing result doc in MongoDB: " +
                            str(e))
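# A hedged usage sketch for SizingAnalyzer. The ini file name and its
# section/key values are assumptions that mirror the config.get() calls
# above; Status and JobStatus come from the surrounding project, and the
# attribute access below assumes JobStatus exposes the fields passed to its
# constructor.
if __name__ == '__main__':
    import time
    from configparser import ConfigParser

    cfg = ConfigParser()
    cfg.read("analyzer.ini")  # hypothetical config file
    analyzer = SizingAnalyzer(cfg)
    end_ns = int(time.time() * 1e9)
    start_ns = end_ns - 3600 * 10**9  # analyze the past hour
    job = analyzer.analyze_node_cpu(start_ns, end_ns,
                                    stat_type="p95", scaling_factor=1.2)
    if job.status == Status.SUCCESS:
        print(job.data["results"])
    else:
        print("analysis failed: %s" % job.error)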