def test_create_retention_policy(self):
        """Test create retention policy for TestInfluxDBClient object."""
        cli = DataFrameClient(database='db')
        example_response = '{"results":[{}]}'

        with requests_mock.Mocker() as m:
            m.register_uri(requests_mock.POST,
                           "http://localhost:8086/query",
                           text=example_response)
            cli.create_retention_policy('somename', '1d', 4, database='db')

            self.assertEqual(
                m.last_request.qs['q'][0],
                'create retention policy "somename" on '
                '"db" duration 1d replication 4 shard duration 0s')
Example 3
from influxdb import DataFrameClient
import logging

logger = logging.getLogger(__name__)  # module-level logger assumed by this example


class tsdb(object):
    def __init__(self,
                 dbname,
                 host='localhost',
                 port=8086,
                 user='******',
                 password='******'):
        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.dbname = dbname
        self.client = None
        self.protocol = 'json'

    def _connect(self):
        if self.client is None:
            self.client = DataFrameClient(host=self.host,
                                          port=self.port,
                                          username=self.user,
                                          password=self.password,
                                          database=self.dbname)
            #self.client.switch_database(self.dbname)

    def _disconnect(self):
        if self.client is not None:
            self.client.close()
            self.client = None

    def _reconnect(self):
        self._disconnect()
        self._connect()

    def create_db(self):
        self._connect()
        dbs = self.client.get_list_database()
        for e in dbs:
            if self.dbname in e.values():
                logger.debug("Database {} is already exist.".format(
                    self.dbname))
                return

        logger.info("Creating database:{}".format(self.dbname))
        self.client.create_database(self.dbname)
        # self._set_retention_policy()

    def _set_retention_policy(self):
        self._connect()
        self.client.create_retention_policy(name='raw',
                                            duration='12h',
                                            replication=1,
                                            default=True)
        self.client.create_retention_policy(name='cooked',
                                            duration='52w',
                                            replication=1,
                                            default=False)

    def check_db(self):
        self._connect()
        db = self.client.get_list_database()
        ms = self.client.get_list_measurements()
        rp = self.client.get_list_retention_policies(self.dbname)
        user = self.client.get_list_users()

        print('db: {}, measurements: {}'.format(db, ms))
        print('retention policy: {}'.format(rp))
        print('users: {}'.format(user))

    def insert(self, df, measurement, tags=None):
        self._connect()
        try:
            result = self.client.write_points(df,
                                              measurement,
                                              tags=tags,
                                              time_precision='n',
                                              protocol=self.protocol)
        except Exception as e:
            logger.info('influxdb write error: %s', e)
            result = False
        return result

    def query(self, sql):
        self._connect()
        result = self.client.query(sql)
        return result
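
A minimal usage sketch of the wrapper above, assuming an InfluxDB server on localhost:8086 and real credentials in place of the masked ones; the measurement name and sample data are illustrative.

import pandas as pd

db = tsdb('metrics_db', host='localhost', port=8086,
          user='admin', password='secret')  # placeholder credentials
db.create_db()

# Build a small frame indexed by timestamp, as DataFrameClient.write_points expects.
index = pd.date_range('2021-01-01', periods=3, freq='1min')
df = pd.DataFrame({'value': [1.0, 2.5, 3.2]}, index=index)

db.insert(df, 'cpu_load', tags={'host': 'node-1'})
print(db.query('SELECT * FROM "cpu_load"'))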
Example 4
class MetricsConsumer(object):
    def __init__(self, app_slo, app_config_file, config_file, app_id):
        with open(config_file) as json_data:
            self.config = json.load(json_data)

        with open(app_config_file) as json_data:
            self.app_metric_config = json.load(json_data)

        if "metric" in app_slo:
            self.app_metric_config["metric"] = app_slo["metric"]

        if "threshold" in app_slo:
            self.app_metric_config["threshold"] = app_slo["threshold"]

        self.group_keys = {"intel/docker": "io.kubernetes.pod.name"}
        self.default_group_key = "nodename"
        influx_host = config.get("INFLUXDB", "HOST")
        influx_port = config.get("INFLUXDB", "PORT")
        influx_user = config.get("INFLUXDB", "USERNAME")
        influx_password = config.get("INFLUXDB", "PASSWORD")
        self.influx_client = DataFrameClient(
            influx_host, influx_port, influx_user, influx_password,
            config.get("INFLUXDB", "RAW_DB_NAME"))
        self.app_influx_client = DataFrameClient(
            influx_host, influx_port, influx_user, influx_password,
            config.get("INFLUXDB", "APP_DB_NAME"))
        self.deployment_id = ''

        derived_db = config.get("INFLUXDB", "DERIVED_METRIC_DB_NAME")
        self.derived_influx_client = DataFrameClient(influx_host, influx_port,
                                                     influx_user,
                                                     influx_password,
                                                     derived_db)
        self.derived_influx_client.create_database(derived_db)
        self.derived_influx_client.create_retention_policy(
            'derived_metric_policy', '5w', 1, default=True)
        self.logger = logging.getLogger(app_id)

    def get_app_threshold(self):
        threshold_config = self.app_metric_config["threshold"]
        value = float(threshold_config["value"])
        unit = threshold_config["unit"]
        # The app metric is measured in seconds; convert from milliseconds if needed
        if unit == "ms":
            value = value / 1000.
        return value

    def get_app_metric(self, start_time, end_time, is_derived=True):
        self.logger.info(
            "Start processing app metric with app_metric_config %s" %
            self.app_metric_config)
        metric_name = self.app_metric_config["metric"]["name"]
        aggregation = self.app_metric_config["analysis"]["aggregation"]
        self.incident_type = self.app_metric_config["type"]
        self.incident_metric = self.app_metric_config["metric"]
        self.incident_threshold = self.app_metric_config["threshold"]

        time_filter = "WHERE time >= %d AND time <= %d" % (start_time,
                                                           end_time)
        app_metric_query = ("SELECT time, %s(value) as value FROM \"%s\" %s" %
                            (aggregation, metric_name, time_filter))

        if "tags" in self.app_metric_config["metric"]:
            tags = self.app_metric_config["metric"]["tags"]
            tags_filter = " AND ".join(
                ["\"%s\"='%s'" % (tag["key"], tag["value"]) for tag in tags])
            app_metric_query += (" AND %s" % (tags_filter))

        app_metric_query += (" GROUP BY time(%ds) fill(none)" %
                             (SAMPLE_INTERVAL))
        self.logger.debug("app_metric_query = %s" % (app_metric_query))
        df = self.app_influx_client.query(app_metric_query)
        self.logger.debug("App metric query completed")
        if metric_name not in df:
            return None

        if not is_derived:
            return df[metric_name]

        if df[metric_name] is not None:
            slo_state = WindowState(
                self.app_metric_config["analysis"]["observation_window_sec"],
                self.get_app_threshold(),
                SAMPLE_INTERVAL,
            )
            df[metric_name]["value"] = df[metric_name].apply(
                lambda row: slo_state.compute_derived_value(
                    row.name.value, row.value),
                axis=1,
            )
        return df[metric_name]

    def get_raw_metrics(self, start_time, end_time, app_metric=None):
        metrics_result = MetricsResults(self.derived_influx_client,
                                        is_derived=False)
        if app_metric is not None:
            metrics_result.set_app_metric(
                app_metric, self.app_metric_config["metric"]["name"])
        time_filter = "WHERE time >= %d AND time <= %d" % (start_time,
                                                           end_time)

        for metric_config in self.config:
            metric_source = str(metric_config["metric_name"])
            group_name = self.default_group_key
            is_container_metric = False
            for k, v in self.group_keys.items():
                if k in metric_source:
                    group_name = v
                    is_container_metric = True
                    break

            if metric_source.startswith("/"):
                metric_source = metric_source[1:]

            # construct tags_filter if needed for this metric
            tags_filter = None
            if "tags" in metric_config:
                tags = metric_config["tags"]
                tags_filter = " AND ".join(
                    ["\"%s\"='%s'" % (k, v) for k, v in tags.items()])

            raw_metrics_query = ("SELECT * FROM \"%s\" %s" %
                                 (metric_source, time_filter))
            if tags_filter:
                raw_metrics_query += (" AND %s" % (tags_filter))
            raw_metrics = self.influx_client.query(raw_metrics_query)
            metrics_thresholds = {}
            raw_metrics_len = len(raw_metrics)
            if raw_metrics_len == 0:
                self.logger.info(
                    "Unable to find data for %s, skipping this metric..." %
                    (metric_source))
                continue

            for metric_group_name in raw_metrics[metric_source][
                    group_name].unique():
                if not metric_group_name or metric_group_name == "":
                    self.logger.info("Unable to find %s in metric %s" %
                                     (metric_group_name, metric_source))
                    continue

            df = raw_metrics[metric_source]
            dfg = df.groupby(group_name)
            metrics_result.add_metric(
                metric_source, metric_source, is_container_metric, dfg,
                metric_config["resource"],
                metric_config["analysis"]["observation_window_sec"],
                metric_config["threshold"]["value"],
                metric_config["threshold"]["type"],
                metric_config["threshold"]["unit"])

        return metrics_result

    def get_derived_metrics(self, start_time, end_time, app_metric=None):
        derived_metrics_result = MetricsResults(self.derived_influx_client,
                                                is_derived=True)
        if app_metric is not None:
            derived_metrics_result.set_app_metric(
                app_metric, self.app_metric_config["metric"]["name"] + "/" +
                self.app_metric_config["type"])
        node_metric_keys = "value,nodename,deploymentId"
        container_metric_keys = "value,\"io.kubernetes.pod.name\",nodename,deploymentId"
        time_filter = "WHERE time > %d AND time <= %d" % (start_time, end_time)

        self.logger.info("Start processing infrastructure metrics")
        for metric_config in self.config:
            metric_source = str(metric_config["metric_name"])
            group_name = self.default_group_key
            is_container_metric = False
            for k, v in self.group_keys.items():
                if k in metric_source:
                    group_name = v
                    is_container_metric = True
                    break

            metric_type = metric_config["type"]
            new_metric_name = metric_source + "/" + metric_type
            if metric_source.startswith("/"):
                metric_source = metric_source[1:]

            # construct tags_filter if needed for this metric
            tags_filter = None
            if "tags" in metric_config:
                tags = metric_config["tags"]
                tags_filter = " AND ".join(
                    ["\"%s\"='%s'" % (k, v) for k, v in tags.items()])

            # fetch raw metric values from influxdb
            raw_metrics = None
            if is_container_metric:
                raw_metrics_query = (
                    "SELECT %s FROM \"%s\" %s" %
                    (container_metric_keys, metric_source, time_filter))
            else:
                raw_metrics_query = (
                    "SELECT %s FROM \"%s\" %s" %
                    (node_metric_keys, metric_source, time_filter))
            if tags_filter:
                raw_metrics_query += (" AND %s" % (tags_filter))
            self.logger.debug("raw metrics for derived metrics query = %s" %
                              (raw_metrics_query))
            raw_metrics = self.influx_client.query(raw_metrics_query)
            self.logger.debug("raw metrics query completed")
            if len(raw_metrics) == 0:
                self.logger.info(
                    "Unable to find data for %s; skipping this metric..." %
                    (metric_source))
                continue
            metric_df = raw_metrics[metric_source]
            metric_group_states = {}
            self.deployment_id = metric_df["deploymentId"].iloc[0]
            # fetch normalizer metric values if normalization is needed
            normalizer_metrics = None
            if "normalizer" in metric_config:
                normalizer = str(metric_config["normalizer"])
                new_metric_name = metric_source + "_normalized/" + metric_type
                if normalizer.startswith("/"):
                    normalizer = normalizer[1:]

                if is_container_metric:
                    normalizer_metrics_query = (
                        "SELECT %s FROM \"%s\" %s" %
                        (container_metric_keys, normalizer, time_filter))
                else:
                    normalizer_metrics_query = (
                        "SELECT %s FROM \"%s\" %s" %
                        (node_metric_keys, normalizer, time_filter))
                if tags_filter:
                    normalizer_metrics_query += (" AND %s" % (tags_filter))
                self.logger.debug("normalizer metrics query = %s" %
                                  (normalizer_metrics_query))
                normalizer_metrics = self.influx_client.query(
                    normalizer_metrics_query)
                if len(normalizer_metrics) == 0:
                    self.logger.info(
                        "Unable to find data for normalizer %s; skipping metric %s..."
                        % (normalizer, metric_source))
                    continue
                normalizer_df = normalizer_metrics[normalizer]
                if normalizer_df["value"].max() == 0:
                    self.logger.info(
                        "All zero values in normalizer %s, skipping metric %s..."
                        % (normalizer, metric_source))
                    continue

            self.logger.debug(
                "Converting raw metric %s\n  into derived metric %s" %
                (metric_source, new_metric_name))

            # process metric values for each group
            # metric_group_name = nodename for node metrics, pod.name for container metrics
            for metric_group_name in raw_metrics[metric_source][
                    group_name].unique():
                if not metric_group_name or metric_group_name == "":
                    self.logger.info("Unable to find %s in metric %s" %
                                     (metric_group_name, metric_source))
                    continue
                if metric_group_name not in metric_group_states:
                    new_state = WindowState(
                        metric_config["observation_window_sec"],
                        metric_config["threshold"]["value"],
                        SAMPLE_INTERVAL,
                    )
                    metric_group_states[metric_group_name] = new_state

                metric_group_ind = metric_df.loc[metric_df[group_name] ==
                                                 metric_group_name].index

                # perform normalization if needed for raw metrics in each group
                if normalizer_metrics:
                    normalizer_group_ind = normalizer_df.loc[
                        normalizer_df[group_name] == metric_group_name].index

                    if normalizer_df.loc[normalizer_group_ind,
                                         "value"].max() == 0:
                        self.logger.debug(
                            "Normalizer metric has all zeros for group %s; " %
                            (metric_group_name) +
                            "dropping this group from the raw metric...")
                        metric_df = metric_df.drop(metric_group_ind)
                        continue

                    if len(normalizer_group_ind) != len(metric_group_ind):
                        self.logger.warning(
                            "Normalizer does not have equal length as raw metric; "
                            + "adjusting...")
                        minlen = min(len(metric_group_ind),
                                     len(normalizer_group_ind))
                        metric_group_ind = metric_group_ind[:minlen]
                        normalizer_group_ind = normalizer_group_ind[:minlen]

                    metric_df.loc[metric_group_ind, "value"] = (
                        metric_df.loc[metric_group_ind, "value"] /
                        normalizer_df.loc[normalizer_group_ind, "value"].values)

                # compute derived metric values using configured threshold info
                self.logger.debug(
                    "raw metric before applying threshold for group %s" %
                    (metric_group_name))
                self.logger.debug(
                    metric_df.loc[metric_group_ind,
                                  [group_name, "value"]].to_string(
                                      index=False))
                metric_df.loc[
                    metric_group_ind,
                    "value"] = metric_df.loc[metric_group_ind].apply(
                        lambda row: metric_group_states[row[group_name]].
                        compute_derived_value(row.name.value, row.value),
                        axis=1,
                    )
                self.logger.debug(
                    "derived metric after applying threshold for group %s" %
                    (metric_group_name))
                self.logger.debug(
                    metric_df.loc[metric_group_ind,
                                  [group_name, "value"]].to_string(
                                      index=False))

            metric_dfg = metric_df.groupby(group_name)
            derived_metrics_result.add_metric(
                metric_source, new_metric_name, is_container_metric,
                metric_dfg, metric_config["resource"],
                metric_config["observation_window_sec"],
                metric_config["threshold"]["value"],
                metric_config["threshold"]["type"],
                metric_config["threshold"]["unit"])

        return derived_metrics_result
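
To make the query shape concrete, here is a standalone sketch of the InfluxQL string that get_app_metric assembles; the metric name, aggregation, tag values, and sample interval are placeholders rather than values taken from any real config.

SAMPLE_INTERVAL = 5  # seconds; stands in for the module-level constant

def build_app_metric_query(metric_name, aggregation, start_ns, end_ns, tags=None):
    query = ('SELECT time, %s(value) as value FROM "%s" '
             'WHERE time >= %d AND time <= %d'
             % (aggregation, metric_name, start_ns, end_ns))
    if tags:
        query += " AND " + " AND ".join(
            "\"%s\"='%s'" % (t["key"], t["value"]) for t in tags)
    query += " GROUP BY time(%ds) fill(none)" % SAMPLE_INTERVAL
    return query

print(build_app_metric_query(
    'latency', 'mean', 1600000000000000000, 1600000300000000000,
    tags=[{"key": "service", "value": "frontend"}]))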
Example 5
class SizingAnalyzer(object):
    def __init__(self, config):
        self.config = config
        self.logger = get_logger(__name__,
                                 log_level=("UTILIZATION", "LOGLEVEL"))

        self.percentiles = ast.literal_eval(
            config.get("UTILIZATION", "PERCENTILES"))
        self.stat_type = config.get("UTILIZATION", "DEFAULT_STAT_TYPE")
        self.scaling_factor = float(
            self.config.get("UTILIZATION", "DEFAULT_SCALING_FACTOR"))

        influx_host = config.get("INFLUXDB", "HOST")
        influx_port = config.get("INFLUXDB", "PORT")
        influx_user = config.get("INFLUXDB", "USERNAME")
        influx_password = config.get("INFLUXDB", "PASSWORD")
        input_db = config.get("INFLUXDB", "SIZING_INPUT_DB_NAME")
        output_db = config.get("INFLUXDB", "SIZING_OUTPUT_DB_NAME")
        self.resultdb = resultdb
        self.results_collection = config.get("UTILIZATION",
                                             "SIZING_RESULTS_COLLECTION")

        self.influx_client_input = DataFrameClient(influx_host, influx_port,
                                                   influx_user,
                                                   influx_password, input_db)

        self.influx_client_output = DataFrameClient(influx_host, influx_port,
                                                    influx_user,
                                                    influx_password, output_db)
        self.influx_client_output.create_database(output_db)
        self.influx_client_output.create_retention_policy(
            'sizing_result_policy', '4w', 1, default=True)

    def analyze_node_cpu(self,
                         start_time,
                         end_time,
                         stat_type=None,
                         scaling_factor=None):
        if stat_type is not None:
            self.stat_type = stat_type

        if scaling_factor is not None:
            self.scaling_factor = scaling_factor

        self.base_metric = 'usage'

        self.logger.info(
            "-- [node_cpu] Query influxdb for raw metrics data --")
        output_filter = "derivative(sum(value), 1s) as usage"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND mode=~ /(user|system)/"
        group_tags = self.config.get("UTILIZATION", "NODE_GROUP_TAGS")
        group_by_tags = group_tags + ",time(1ms)"

        metric_name = "node_cpu"
        try:
            node_cpu_usage_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [node_cpu] Compute summary stats --")
        node_cpu_summary = pd.DataFrame()
        new_metric = "node_cpu_usage"
        for k, df in node_cpu_usage_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            #df.drop(df.index[(df.usage > MAX_CORES) | (df.usage < 0)], inplace=True)
            try:
                self.influx_client_output.write_points(df, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_cpu_summary[node_key, self.base_metric] = df.usage.describe(
                self.percentiles).drop(['count', 'std', 'min'])

        node_cpu_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        },
                                inplace=True)
        self.logger.debug("Computed node cpu usage summary:\n %s" %
                          node_cpu_summary.to_json())

        self.logger.info(
            "-- [node_cpu] Query influxdb for current node configs --")
        metric_name = "machine_cpu_cores"
        try:
            node_cpu_size_dict = self.query_influx_metric(
                metric_name, "value", time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        current_cpu_sizes = self.get_node_sizes(node_cpu_size_dict,
                                                "node_cpu_size")
        self.logger.info("Current node cpu sizes (in #cores):\n %s" %
                         current_cpu_sizes)

        self.logger.info("-- [node_cpu] Compute sizing recommendation --")
        recommended_cpu_sizes = self.recommend_node_sizes(node_cpu_summary)
        self.logger.info("Recommended node cpu sizes (in #cores):\n %s" %
                         recommended_cpu_sizes)

        self.logger.info("-- [node_cpu] Store analysis results in mongodb --")
        results = self.construct_analysis_results("cpu", node_cpu_summary,
                                                  recommended_cpu_sizes,
                                                  current_cpu_sizes)
        sizing_result_doc = {
            "object_type": "node",
            "resource": "cpu",
            "unit": "cores",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_node_memory(self,
                            start_time,
                            end_time,
                            stat_type=None,
                            scaling_factor=None,
                            base_metric=None):
        if stat_type is not None:
            self.stat_type = stat_type

        if scaling_factor is not None:
            self.scaling_factor = scaling_factor

        if base_metric is not None:
            self.base_metric = base_metric
        else:
            self.base_metric = self.config.get("UTILIZATION",
                                               "MEMORY_BASE_METRIC")

        self.logger.info(
            "-- [node_memory] Query influxdb for raw metrics data --")
        output_filter = "value/1024/1024/1024"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        group_tags = self.config.get("UTILIZATION", "NODE_GROUP_TAGS")

        metric_name_active = "node_memory_Active"
        try:
            node_mem_active_dict = self.query_influx_metric(
                metric_name_active, output_filter, time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        metric_name_total = "node_memory_MemTotal"
        try:
            node_mem_total_dict = self.query_influx_metric(
                metric_name_total, output_filter, time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        metric_name_free = "node_memory_MemFree"
        try:
            node_mem_free_dict = self.query_influx_metric(
                metric_name_free, output_filter, time_filter, "", group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [node_memory] Compute summary stats --")
        node_mem_summary = pd.DataFrame()
        new_metric = "node_memory_active"
        for k, df_active in node_mem_active_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            try:
                self.influx_client_output.write_points(df_active, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_mem_summary[node_key, 'active'] = df_active.value.describe(
                self.percentiles).drop(['count', 'std', 'min'])

        new_metric = "node_memory_usage"
        for k in node_mem_total_dict.keys():
            group_key = dict((x, y) for x, y in k[1])
            df_total = node_mem_total_dict[k]
            k_free = (metric_name_free, k[1])
            df_free = node_mem_free_dict[k_free]

            df_usage = df_total - df_free
            try:
                self.influx_client_output.write_points(df_usage, new_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (new_metric, str(e)))
            node_key = "instance=" + group_key['instance']
            node_mem_summary[node_key, 'usage'] = df_usage.value.describe(
                self.percentiles).drop(['count', 'std', 'min'])

        node_mem_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        },
                                inplace=True)
        self.logger.debug("Computed node memory usage summary:\n %s" %
                          node_mem_summary.to_json())

        self.logger.info(
            "-- [node_memory] Query influxdb for current node configs --")
        try:
            node_mem_size_dict = self.query_influx_metric(
                "machine_memory_bytes", output_filter, time_filter, "",
                group_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        current_mem_sizes = self.get_node_sizes(node_mem_size_dict,
                                                "node_memory_size")
        self.logger.info("Current node memory sizes (in GB):\n %s" %
                         current_mem_sizes)

        self.logger.info("-- [node_memory] Compute sizing recommendation --")
        recommended_mem_sizes = self.recommend_node_sizes(node_mem_summary)
        self.logger.info("Recommended node memory sizes (in GB):\n %s" %
                         recommended_mem_sizes)

        self.logger.info(
            "-- [node_memory] Store analysis results in mongodb --")
        results = self.construct_analysis_results("memory", node_mem_summary,
                                                  recommended_mem_sizes,
                                                  current_mem_sizes)
        sizing_result_doc = {
            "object_type": "node",
            "resource": "memory",
            "unit": "GB",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }

        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_container_cpu(self,
                              start_time,
                              end_time,
                              stat_type=None,
                              scaling_factor=None):
        if stat_type is not None:
            self.stat_type = stat_type

        if scaling_factor is not None:
            self.scaling_factor = scaling_factor

        self.base_metric = 'usage'

        self.logger.info(
            "-- [container_cpu] Query influxdb for raw metrics data --")
        output_filter = "derivative(sum(value), 1s) as usage"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND image!=''"
        group_tags = self.config.get("UTILIZATION", "CONTAINER_GROUP_TAGS")
        group_by_tags = group_tags + ",pod_name,time(1ms)"

        metric_name_usr = "******"
        try:
            container_cpu_user_dict = self.query_influx_metric(
                metric_name_usr, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        metric_name_sys = "container_cpu_system_seconds_total"
        try:
            container_cpu_sys_dict = self.query_influx_metric(
                metric_name_sys, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [container_cpu] Compute summary stats --")
        df_usage = pd.DataFrame()
        container_cpu_usage_dict = {}
        for k_user, df_user in container_cpu_user_dict.items():
            group_key = k_user[1]
            df_sys = container_cpu_sys_dict[(metric_name_sys, group_key)]
            df_usage = (df_user + df_sys).astype('float32')

            if group_key not in container_cpu_usage_dict.keys():
                container_cpu_usage_dict[group_key] = df_usage
            else:
                df_comb = pd.merge_asof(container_cpu_usage_dict[group_key],
                                        df_usage,
                                        left_index=True,
                                        right_index=True,
                                        suffixes=('_1', '_2'),
                                        direction='nearest')
                container_cpu_usage_dict[group_key].usage = df_comb[[
                    'usage_1', 'usage_2'
                ]].max(axis=1)

        container_cpu_summary = pd.DataFrame()
        output_metric = "container_cpu_usage"
        for k, df_usage in container_cpu_usage_dict.items():
            group_key = dict((x, y) for x, y in k)
            df_usage = df_usage.dropna()
            try:
                self.influx_client_output.write_points(df_usage, output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_cpu_summary[image_key,
                                  self.base_metric] = df_usage.usage.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])

        container_cpu_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        },
                                     inplace=True)
        self.logger.debug("Computed container cpu usage summary:\n %s" %
                          container_cpu_summary.to_json())

        self.logger.info(
            "-- [container_cpu] Query influxdb for current requests and limits --"
        )
        output_filter = "sum(value) as value"
        group_by_tags = group_tags + ",pod_name,time(5s)"
        try:
            cpu_quota_dict = self.query_influx_metric(
                "container_spec_cpu_quota", output_filter, time_filter,
                tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        try:
            cpu_period_dict = self.query_influx_metric(
                "container_spec_cpu_period", output_filter, time_filter,
                tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        output_metric = "container_cpu_settings"
        try:
            current_cpu_settings = self.get_container_cpu_settings(
                cpu_quota_dict, cpu_period_dict, output_metric)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        self.logger.info("Current container cpu settings (in #cores):\n %s" %
                         current_cpu_settings)

        self.logger.info("-- [container_cpu] Compute requests and limits --")
        recommended_cpu_settings = self.recommend_container_cpu_settings(
            container_cpu_summary)
        self.logger.info(
            "Recommended container cpu settings (in #cores):\n %s" %
            recommended_cpu_settings)

        self.logger.info(
            "-- [container_cpu] Store analysis results in mongodb --")
        results = self.construct_analysis_results("cpu", container_cpu_summary,
                                                  recommended_cpu_settings,
                                                  current_cpu_settings)
        sizing_result_doc = {
            "object_type": "container",
            "resource": "cpu",
            "unit": "cores",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }

        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def analyze_container_memory(self,
                                 start_time,
                                 end_time,
                                 stat_type=None,
                                 scaling_factor=None,
                                 base_metric=None):
        if stat_type is not None:
            self.stat_type = stat_type

        if scaling_factor is not None:
            self.scaling_factor = scaling_factor

        if base_metric is not None:
            self.base_metric = base_metric
        else:
            self.base_metric = self.config.get("UTILIZATION",
                                               "MEMORY_BASE_METRIC")

        self.logger.info(
            "-- [container_memory] Query influxdb for raw metrics data --")
        output_filter = "max(value)/1024/1024 as value"
        time_filter = "time > %d AND time <= %d" % (start_time, end_time)
        tags_filter = "AND image!=''"
        group_tags = self.config.get("UTILIZATION", "CONTAINER_GROUP_TAGS")
        group_by_tags = group_tags + ",time(5s)"

        metric_name = "container_memory_working_set_bytes"
        try:
            container_mem_active_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        metric_name = "container_memory_usage_bytes"
        try:
            container_mem_usage_dict = self.query_influx_metric(
                metric_name, output_filter, time_filter, tags_filter,
                group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        self.logger.info("-- [container_memory] Compute summary stats --")
        container_mem_summary = pd.DataFrame()
        output_metric = "container_memory_active"
        for k, df_active in container_mem_active_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            df_active = df_active.dropna()
            try:
                self.influx_client_output.write_points(df_active,
                                                       output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_mem_summary[image_key,
                                  'active'] = df_active.value.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])

        output_metric = "container_memory_usage"
        for k, df_usage in container_mem_usage_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            df_usage = df_usage.dropna()
            try:
                self.influx_client_output.write_points(df_usage, output_metric,
                                                       group_key)
            except Exception as e:
                return JobStatus(
                    status=Status.DB_ERROR,
                    error="Unable to write query result for %s to influxDB: %s"
                    % (output_metric, str(e)))
            image_key = "image=" + group_key['image']
            container_mem_summary[image_key,
                                  'usage'] = df_usage.value.describe(
                                      self.percentiles).drop(
                                          ['count', 'std', 'min'])

        container_mem_summary.rename(index={
            '50%': "median",
            '95%': "p95",
            '99%': "p99"
        },
                                     inplace=True)
        self.logger.debug("Computed container memory usage summary:\n %s" %
                          container_mem_summary.to_json())

        self.logger.info(
            "-- [container_memory] Query influxdb for current requests and limits --"
        )
        try:
            mem_settings_dict = self.query_influx_metric(
                "container_spec_memory_limit_bytes", output_filter,
                time_filter, tags_filter, group_by_tags)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        output_metric = "container_mem_settings"
        try:
            current_mem_settings = self.get_container_mem_settings(
                mem_settings_dict, output_metric)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))
        self.logger.info("Current container memory settings (in MB):\n %s" %
                         current_mem_settings)

        self.logger.info(
            "-- [container_memory] Compute requests and limits --")
        recommended_mem_settings = self.recommend_container_mem_settings(
            container_mem_summary)
        self.logger.info(
            "Recommended container memory settings (in MB):\n %s" %
            recommended_mem_settings)

        self.logger.info(
            "-- [container_memory] Store analysis results in mongodb --")
        this_config = {
            "stat_type": self.stat_type,
            "scaling_factor": self.scaling_factor,
            "base_metric": self.base_metric
        }
        results = self.construct_analysis_results("memory",
                                                  container_mem_summary,
                                                  recommended_mem_settings,
                                                  current_mem_settings)
        sizing_result_doc = {
            "object_type": "container",
            "resource": "memory",
            "unit": "MB",
            "start_time": start_time,
            "end_time": end_time,
            "labels": group_tags.split(","),
            "config": {},
            "results": results
        }
        try:
            self.store_analysis_results(sizing_result_doc)
        except Exception as e:
            return JobStatus(status=Status.DB_ERROR, error=str(e))

        return JobStatus(status=Status.SUCCESS, data=sizing_result_doc)

    def query_influx_metric(self, metric_name, output_filter, time_filter,
                            tags_filter, group_by_tags):
        try:
            influx_query = "SELECT %s FROM %s WHERE %s %s GROUP BY %s" % \
                           (output_filter, metric_name, time_filter, tags_filter, group_by_tags)
            self.logger.debug("Running influxDB read query: %s" % influx_query)
            metric_dict = self.influx_client_input.query(influx_query)
        except Exception as e:
            err_msg = "Unable to fetch %s from influxDB: %s" % (metric_name,
                                                                str(e))
            self.logger.error(err_msg)
            raise Exception(err_msg)

        return metric_dict

    def get_node_sizes(self, node_size_dict, output_name):
        current_sizes = {}

        for k, df in node_size_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            #label_key = tuple(x+"="+y for x, y in k[1])
            try:
                self.influx_client_output.write_points(df, output_name,
                                                       group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)

            node_key = "instance=" + group_key['instance']
            current_sizes[node_key] = {
                'size': math.ceil(df.loc[df.index[len(df) - 1], 'value'])
            }

        return current_sizes

    def get_container_cpu_settings(self, cpu_quota_dict, cpu_period_dict,
                                   output_name):
        current_settings = {}

        metric_name_period = list(cpu_period_dict.keys())[0][0]

        df_settings = pd.DataFrame()
        container_cpu_limit_dict = {}
        for k_quota, df_quota in cpu_quota_dict.items():
            group_key = k_quota[1]
            df_period = cpu_period_dict[(metric_name_period, group_key)]
            df_limit = df_quota.divide(df_period).dropna()

            if group_key not in container_cpu_limit_dict.keys():
                container_cpu_limit_dict[group_key] = df_limit
            else:
                df_comb = pd.merge_asof(container_cpu_limit_dict[group_key],
                                        df_limit,
                                        left_index=True,
                                        right_index=True,
                                        suffixes=('_1', '_2'),
                                        direction='nearest')
                container_cpu_limit_dict[group_key] = df_comb[[
                    'value_1', 'value_2'
                ]].min(axis=1)

        for k, df in container_cpu_limit_dict.items():
            group_key = dict((x, y) for x, y in k)
            try:
                self.influx_client_output.write_points(df, output_name,
                                                       group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)

            container_key = "image=" + group_key['image']
            current_settings[container_key] = {
                'requests':
                0,
                'limits':
                math.ceil(df.loc[df.index[len(df) - 1], 'value'] * 100) /
                float(100)
            }

        return current_settings

    def get_container_mem_settings(self, mem_settings_dict, output_name):
        current_settings = {}

        for k, df in mem_settings_dict.items():
            group_key = dict((x, y) for x, y in k[1])
            try:
                self.influx_client_output.write_points(df.dropna(),
                                                       output_name, group_key)
            except Exception as e:
                err_msg = (
                    "Unable to write query result for %s to influxDB: %s" %
                    (output_name, str(e)))
                self.logger.error(err_msg)
                raise Exception(err_msg)

            container_key = "image=" + group_key['image']
            current_settings[container_key] = {
                'requests': 0,
                'limits': math.ceil(df.loc[df.index[len(df) - 1], 'value'])
            }

        return current_settings

    def recommend_node_sizes(self, node_summary):
        node_sizes = {}

        for column in node_summary:
            col_keys = node_summary[column].name
            group_key = col_keys[0]
            metric_type = col_keys[1]
            if metric_type == self.base_metric:
                node_sizes[group_key] = {
                    'size':
                    math.ceil(node_summary[column][self.stat_type] *
                              self.scaling_factor)
                }

        return node_sizes

    def recommend_container_cpu_settings(self, container_cpu_summary):
        container_cpu_settings = {}

        for column in container_cpu_summary:
            col_keys = container_cpu_summary[column].name
            group_key = col_keys[0]
            limit_value = max(
                float(self.config.get("UTILIZATION", "MIN_CPU_LIMITS")),
                math.ceil(container_cpu_summary[column][self.stat_type] *
                          self.scaling_factor * 100) / 100.0)
            container_cpu_settings[group_key] = {
                "requests":
                math.ceil(container_cpu_summary[column][self.stat_type] * 100)
                / float(100),
                "limits":
                limit_value
            }

        return container_cpu_settings

    def recommend_container_mem_settings(self, container_mem_summary):
        container_mem_settings = {}

        for column in container_mem_summary:
            col_keys = container_mem_summary[column].name
            group_key = col_keys[0]
            if group_key not in container_mem_settings:
                container_mem_settings[group_key] = {
                    'requests': 0,
                    'limits': 0
                }
            metric_type = col_keys[1]
            if metric_type == 'active':
                requests = math.ceil(
                    container_mem_summary[column][self.stat_type])
                container_mem_settings[group_key]['requests'] = requests
            else:  # metric_type = 'usage'
                limits = math.ceil(
                    container_mem_summary[column][self.stat_type] *
                    self.scaling_factor)
                container_mem_settings[group_key]['limits'] = limits

        return container_mem_settings

    def construct_analysis_results(self, resource_type, usage_summary,
                                   recommended_settings, current_settings):
        summary_stats = {}
        results = []

        for column in usage_summary:
            col_keys = usage_summary[column].name
            label_key = col_keys[0]
            metric_type = col_keys[1]
            summary_type = resource_type + "_" + metric_type
            if label_key not in summary_stats:
                summary_stats[label_key] = {
                    summary_type: usage_summary[column].to_dict()
                }
            else:
                summary_stats[label_key][summary_type] = usage_summary[
                    column].to_dict()

        for label_key in summary_stats:
            label_values = {}
            for label in label_key.split(","):
                label_pair = label.split("=")
                label_values[label_pair[0]] = label_pair[1]

            if label_key not in current_settings.keys():
                current_settings[label_key] = {'requests': 0, 'limits': 0}

            results.append({
                "label_values":
                label_values,
                "summary_stats":
                summary_stats[label_key],
                "current_settings":
                current_settings[label_key],
                "recommended_settings":
                recommended_settings[label_key]
            })

        return results

    def store_analysis_results(self, sizing_result_doc):
        sizing_result_doc["config"] = {
            "stat_type": self.stat_type,
            "scaling_factor": self.scaling_factor,
            "base_metric": self.base_metric
        }

        try:
            self.resultdb[self.results_collection].insert_one(
                sizing_result_doc)
        except Exception as e:
            self.logger.error(
                "Unable to store sizing result doc in MongoDB: %s" %
                str(sizing_result_doc))
            raise Exception("Unable to store sizing result doc in MongoDB: " +
                            str(e))
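
The sizing recommendation above reduces to a percentile summary scaled by a factor, as in recommend_node_sizes. A self-contained sketch with made-up usage samples, an assumed "p95" stat type, and a 1.2 scaling factor:

import math
import pandas as pd

usage = pd.Series([0.8, 1.1, 1.4, 2.0, 2.3, 2.7, 3.1, 3.4])  # cores; illustrative samples

# Same summary shape the analyzer builds: percentiles, minus count/std/min.
summary = usage.describe(percentiles=[0.5, 0.95, 0.99]).drop(['count', 'std', 'min'])
summary = summary.rename(index={'50%': 'median', '95%': 'p95', '99%': 'p99'})

stat_type, scaling_factor = 'p95', 1.2  # placeholders for the config values
recommended_size = math.ceil(summary[stat_type] * scaling_factor)
print(summary.to_dict(), recommended_size)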