Example #1
 def __init__(self, queue, parent_pid):
     super(PrometheusMetrics, self).__init__()
     self.q = queue
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
Example #2
 def __init__(self,
              queue,
              parent_pid,
              skip_mini,
              worker_number,
              canary=False):
     super(Worker, self).__init__()
     # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
     if settings.REDIS_PASSWORD:
         # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings
         #                      Branch #3262: py3
         # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
         self.redis_conn = StrictRedis(
             password=settings.REDIS_PASSWORD,
             unix_socket_path=settings.REDIS_SOCKET_PATH,
             charset='utf-8',
             decode_responses=True)
     else:
         # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
         self.redis_conn = StrictRedis(
             unix_socket_path=settings.REDIS_SOCKET_PATH,
             charset='utf-8',
             decode_responses=True)
     self.q = queue
     self.parent_pid = parent_pid
     self.daemon = True
     self.canary = canary
     self.skip_mini = skip_mini
     # @added 20201017 - Feature #3788: snab_flux_load_test
     #                   Feature #3680: horizon.worker.datapoints_sent_to_redis
     # Added worker_number
     self.worker_number = worker_number
     # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
     # Added get_redis_conn_decoded
     self.redis_conn_decoded = get_redis_conn_decoded(parent_skyline_app)
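The snippets above and below only show these helpers being used. Judging by the explicit StrictRedis(..., charset='utf-8', decode_responses=True) calls in Example #2 and the py3 comments in the later examples, get_redis_conn and get_redis_conn_decoded are presumably thin wrappers along the following lines. This is a sketch for orientation only, not the actual Skyline implementation.

# Sketch only - inferred from the StrictRedis calls and py3 comments in the
# examples on this page; the real helpers live in the Skyline codebase and
# handle more settings.
from redis import StrictRedis
import settings

def get_redis_conn(current_skyline_app):
    # Redis client that returns responses as bytes
    if settings.REDIS_PASSWORD:
        return StrictRedis(password=settings.REDIS_PASSWORD,
                           unix_socket_path=settings.REDIS_SOCKET_PATH)
    return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

def get_redis_conn_decoded(current_skyline_app):
    # Redis client that decodes responses to str, as required on py3 when
    # values from Redis sets and hashes are compared with str objects
    if settings.REDIS_PASSWORD:
        return StrictRedis(password=settings.REDIS_PASSWORD,
                           unix_socket_path=settings.REDIS_SOCKET_PATH,
                           charset='utf-8', decode_responses=True)
    return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH,
                       charset='utf-8', decode_responses=True)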
Example #3
    def __init__(self, parent_pid):
        super(Aggregator, self).__init__()
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.parent_pid = parent_pid
        self.daemon = True
        self.current_pid = getpid()
Example #4
def get_base_name_from_metric_id(current_skyline_app, metric_id):
    """
    Returns the base_name for a metric id from the
    aet.metrics_manager.ids_with_metric_names Redis hash, or from the DB if not
    found in Redis.

    :param current_skyline_app: the app calling the function
    :param metric_id: the metric id to lookup the base_name for.
    :type current_skyline_app: str
    :type metric_id: int
    :return: base_name
    :rtype: str

    """

    redis_key = 'aet.metrics_manager.ids_with_metric_names'
    function_str = 'functions.metrics.get_base_name_from_metric_id'

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' %
            (current_skyline_app, function_str, e))
        return metric_id

    base_name = None
    try:
        base_name = redis_conn_decoded.hget(redis_key, metric_id)
        # DEBUG
        current_logger.info(
            'debug :: %s :: %s :: hget(%s, %s)' %
            (current_skyline_app, function_str, redis_key, str(metric_id)))
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: failed to get base_name for %s: %s' %
            (current_skyline_app, function_str, str(metric_id), str(err)))

    if not base_name:
        try:
            base_name = base_name_from_metric_id(current_skyline_app,
                                                 metric_id, False)
        except Exception as err:
            current_logger.error(
                'error :: %s :: %s :: base_name_from_metric_id failed to determine base_name from metric_id: %s - %s'
                %
                (current_skyline_app, function_str, str(metric_id), str(err)))

    return base_name
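A minimal usage sketch for the function above; the app name and metric id are hypothetical and the returned value is illustrative only.

# Hypothetical call site - assumes metric id 245 exists in the
# aet.metrics_manager.ids_with_metric_names Redis hash
base_name = get_base_name_from_metric_id('webapp', 245)
# e.g. 'stats.statsd.graphiteStats.calculationtime'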
Example #5
 def __init__(self, parent_pid):
     """
     Initialize Rolling
     """
     super(RollingThunder, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example #6
 def __init__(self, parent_pid):
     """
     Initialize the SNAB_flux_load_test
     """
     super(SNAB_flux_load_test, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example #7
 def __init__(self, parent_pid):
     """
     Initialize RelatedMetrics
     """
     super(RelatedMetrics, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example #8
 def __init__(self, parent_pid):
     """
     Initialize Cloudbursts
     """
     super(Cloudbursts, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example #9
    def __init__(self, parent_pid):
        """
        Initialize Luminosity

        Create the :obj:`redis_conn` a Redis client object
        Create the :obj:`correlations` list
        Create the :obj:`mysql_conn` MySQLConnection object
        Create the :obj:`memcache_client` a constructor that does not make a
        connection to memcached. The first call to a method on the object will
        do that.

        """
        super(Luminosity, self).__init__()
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
        # types need to be decoded as utf-8 to str
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

        # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        # Added single functions to deal with the Redis connection and the
        # charset='utf-8', decode_responses=True arguments required in py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
        #                      Task #3032: Debug number of Python processes and memory use
        #                      Branch #3002: docker
        # Reduce the number of Manager instances used, as each requires the
        # entire memory of the parent to be copied into its subprocess; this
        # results in a python process per Manager instance, using as much
        # memory as the parent.  OK on a server, not so much in a container.
        # Disabled all the Manager().list() below and replaced with Redis sets
        # self.correlations = Manager().list()
        # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
        # self.metrics_checked_for_correlation = Manager().list()
        # self.runtimes = Manager().list()
        self.mysql_conn = mysql.connector.connect(**config)
        if settings.MEMCACHE_ENABLED:
            self.memcache_client = pymemcache_Client(
                (settings.MEMCACHED_SERVER_IP, settings.MEMCACHED_SERVER_PORT),
                connect_timeout=0.1,
                timeout=0.2)
        else:
            self.memcache_client = None
Example #10
 def __init__(self, parent_pid):
     super(Worker, self).__init__()
     self.parent_pid = parent_pid
     self.daemon = True
     # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
     #                      Branch #3262: py3
     # if settings.REDIS_PASSWORD:
     #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
     # else:
     #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
     # @added 20191111 - Bug #3266: py3 Redis binary objects not strings
     #                   Branch #3262: py3
     # Added single functions to deal with the Redis connection and the
     # charset='utf-8', decode_responses=True arguments required in py3
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
Example #11
    def __init__(self, queue, parent_pid):
        super(Worker, self).__init__()
        # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.q = queue
        self.parent_pid = parent_pid
        self.daemon = True
Example #12
def check_redis_key(current_skyline_app, redis_key, log=True):
    """
    Check a Redis key.

    :param current_skyline_app: the app calling the function
    :param redis_key: the Redis key name
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type redis_key: str
    :type log: boolean
    :return: data
    :rtype: object

    """

    function_str = 'functions.redis.check_redis_key'
    data = None
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    redis_conn_decoded = None
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if log:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to connect to Redis to get %s - %s' %
                (function_str, redis_key, e))

    if not redis_conn_decoded:
        return data

    try:
        data = redis_conn_decoded.get(redis_key)
    except Exception as e:
        if log:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to get Redis key %s - %s' %
                (function_str, redis_key, e))
        data = None

    return data
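A usage sketch for check_redis_key; the key name here is hypothetical. The function returns None both when the key does not exist and when Redis is unreachable.

# Hypothetical key name for illustration
last_run = check_redis_key('analyzer', 'analyzer.last_run', log=False)
if last_run is None:
    # key missing or Redis unreachable
    pass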
Example #13
def get_base_names_and_metric_ids(current_skyline_app):
    """
    Returns a dict of base_names with their metric ids from the
    aet.metrics_manager.metric_names_with_ids Redis hash.

    :param current_skyline_app: the app calling the function
    :type current_skyline_app: str
    :return: base_names_with_ids
    :rtype: dict

    """

    base_names_with_ids = {}

    redis_key = 'aet.metrics_manager.metric_names_with_ids'
    function_str = 'functions.metrics.get_base_names_and_metric_ids'

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' %
            (current_skyline_app, function_str, e))
        return base_names_with_ids

    try:
        base_names_with_ids = redis_conn_decoded.hgetall(redis_key)
        if base_names_with_ids:
            # Format cast the id str as an int
            for base_name in list(base_names_with_ids.keys()):
                metric_id = int(str(base_names_with_ids[base_name]))
                base_names_with_ids[base_name] = metric_id
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: hgetall %s failed - %s' %
            (current_skyline_app, function_str, redis_key, str(err)))

    return base_names_with_ids
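The returned dict maps each base_name (str) to its metric id (already cast to int), so callers can iterate it directly; a short sketch:

base_names_with_ids = get_base_names_and_metric_ids('analyzer')
for base_name, metric_id in base_names_with_ids.items():
    # metric_id is an int, base_name is a str
    print(metric_id, base_name)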
Example #14
def get_metric_id_from_base_name(current_skyline_app, base_name):
    """
    Returns a metric id for a base_name.

    :param current_skyline_app: the app calling the function
    :param base_name: the base_name of the metric to look up the metric id for
    :return: metric_id
    :rtype: int

    """

    metric_id = 0
    redis_key = 'aet.metrics_manager.metric_names_with_ids'
    function_str = 'functions.metrics.get_metric_id_from_base_name'

    current_skyline_app_logger = current_skyline_app + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: get_redis_conn_decoded failed - %s' % (
            current_skyline_app, function_str, e))
        return metric_id

    metric_id_str = None
    try:
        metric_id_str = redis_conn_decoded.hget(redis_key, base_name)
        # DEBUG
        current_logger.info('debug :: %s :: %s :: hget(%s, %s)' % (
            current_skyline_app, function_str, redis_key, str(base_name)))
        if metric_id_str:
            metric_id = int(str(metric_id_str))
    except Exception as err:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: %s :: failed to get metric_id for %s: %s' % (
            current_skyline_app, function_str, base_name, str(err)))

    return metric_id
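This is the inverse lookup to Example #4; a sketch with a hypothetical metric name, where 0 indicates the base_name was not found in the Redis hash.

# Hypothetical metric name for illustration
metric_id = get_metric_id_from_base_name('webapp', 'stats.statsd.bad_lines_seen')
if not metric_id:
    # base_name not known to metrics_manager
    pass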
Example #15
    def __init__(self, parent_pid, skip_mini):
        super(Roomba, self).__init__()
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

        # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        # Added single functions to deal with the Redis connection and the
        # charset='utf-8', decode_responses=True arguments required in py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.daemon = True
        self.parent_pid = parent_pid
        self.skip_mini = skip_mini
Example #16
    def run(self):
        """
        Called when process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started roomba' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    '%s :: roomba can\'t connect to redis at socket path %s' %
                    (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                if settings.REDIS_PASSWORD:
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                else:
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                   Branch #3262: py3
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ROOMBA_PROCESSES + 1):
                if not self.skip_mini:
                    logger.info(
                        '%s :: starting vacuum process on mini namespace' %
                        skyline_app)
                    p = Process(target=self.vacuum,
                                args=(i, settings.MINI_NAMESPACE,
                                      settings.MINI_DURATION +
                                      settings.ROOMBA_GRACE_TIME))
                    pids.append(p)
                    p.start()

                logger.info('%s :: starting vacuum process' % skyline_app)
                p = Process(
                    target=self.vacuum,
                    args=(i, settings.FULL_NAMESPACE,
                          settings.FULL_DURATION + settings.ROOMBA_GRACE_TIME))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # deroomba - kill any lingering vacuum processes
            # Changed to manage Roomba processes as edge cases related to I/O
            # wait have been experienced that resulted in Roomba stalling so a
            # ROOMBA_TIMEOUT setting was added and here we use the pattern
            # described by http://stackoverflow.com/users/2073595/dano at
            # http://stackoverflow.com/a/26064238 to monitor and kill any
            # stalled processes rather than using p.join(TIMEOUT) - 20160505
            # @earthgecko ref 1342
            logger.info('%s :: allowing vacuum process/es %s seconds to run' %
                        (skyline_app, str(settings.ROOMBA_TIMEOUT)))
            start = time()
            while time() - start <= settings.ROOMBA_TIMEOUT:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - start
                    logger.info('%s :: vacuum processes completed in %.2f' %
                                (skyline_app, time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all Roomba processes' %
                            (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # sleeping in the main process is more CPU efficient than sleeping
            # in the vacuum def.  Also roomba is quite CPU intensive so we only
            # want to run roomba once every minute
            process_runtime = time() - now
            roomba_optimum_run_duration = 60
            if process_runtime < roomba_optimum_run_duration:
                sleep_for = (roomba_optimum_run_duration - process_runtime)
                logger.info('%s :: sleeping for %.2f due to low run time' %
                            (skyline_app, sleep_for))
                sleep(sleep_for)
Example #17
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('aggregator :: starting aggregator')

        # Determine a primary aggregator
        aggregator_pid = getpid()
        main_process_pid = 0
        try:
            main_process_pid = int(
                self.redis_conn_decoded.get('flux.main_process_pid'))
            if main_process_pid:
                logger.info(
                    'aggregator :: main_process_pid found in Redis key - %s' %
                    str(main_process_pid))
        except:
            main_process_pid = 0
        if not main_process_pid:
            logger.error(
                'error :: aggregator :: no main_process_pid known, exiting')
            sys.exit(1)

        primary_aggregator_key = 'flux.primary_aggregator_pid.%s' % str(
            main_process_pid)
        logger.info(
            'aggregator :: starting primary_aggregator election using primary_aggregator_key: %s'
            % primary_aggregator_key)
        sleep_for = random.uniform(0.1, 1.5)
        logger.info(
            'aggregator :: starting primary_aggregator election - sleeping for %s'
            % str(sleep_for))
        sleep(sleep_for)
        primary_aggregator_pid = 0
        try:
            primary_aggregator_pid = int(
                self.redis_conn_decoded.get(primary_aggregator_key))
            if primary_aggregator_pid:
                logger.info(
                    'aggregator :: primary_aggregator_pid found in Redis key - %s'
                    % str(primary_aggregator_pid))
        except:
            primary_aggregator_pid = 0
        if not primary_aggregator_pid:
            try:
                self.redis_conn.setex(primary_aggregator_key, 300,
                                      aggregator_pid)
                primary_aggregator_pid = int(
                    self.redis_conn_decoded.get(primary_aggregator_key))
                logger.info(
                    'aggregator :: set self pid to primary_aggregator - %s' %
                    str(primary_aggregator_pid))
            except:
                primary_aggregator_pid = 0
        primary_aggregator = False
        if primary_aggregator_pid == aggregator_pid:
            primary_aggregator = True
        logger.info(
            'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s'
            % (str(primary_aggregator_pid), str(primary_aggregator)))

        last_flush = int(time()) - 59
        remove_from_flux_queue_redis_set = []

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'aggregator :: cannot connect to redis at socket path %s'
                        % (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    try:
                        self.redis_conn = get_redis_conn(skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: could not get_redis_conn - %s'
                            % str(e))
                    try:
                        self.redis_conn_decoded = get_redis_conn_decoded(
                            skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: could not get_redis_conn_decoded - %s'
                            % str(e))

            try:
                time_now = int(time())
                while (time_now - last_flush) <= 59:
                    sleep(1)
                    remove_from_flux_queue_redis_set = []
                    time_now = int(time())

                primary_aggregator_pid = 0
                try:
                    primary_aggregator_pid = int(
                        self.redis_conn_decoded.get(primary_aggregator_key))
                    if primary_aggregator_pid:
                        logger.info(
                            'aggregator :: primary_aggregator_pid found in Redis key - %s'
                            % str(primary_aggregator_pid))
                except:
                    primary_aggregator_pid = 0
                if not primary_aggregator_pid:
                    try:
                        self.redis_conn.setex(primary_aggregator_key, 300,
                                              aggregator_pid)
                        primary_aggregator_pid = int(
                            self.redis_conn_decoded.get(
                                primary_aggregator_key))
                        logger.info(
                            'aggregator :: set self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                    except:
                        primary_aggregator_pid = 0
                primary_aggregator = False
                if primary_aggregator_pid == aggregator_pid:
                    primary_aggregator = True
                logger.info(
                    'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s'
                    % (str(primary_aggregator_pid), str(primary_aggregator)))

                flux_aggregator_queue = []
                if primary_aggregator:
                    logger.info('aggregator :: checking for data to aggregate')
                    try:
                        flux_aggregator_queue = self.redis_conn_decoded.smembers(
                            'flux.aggregator.queue')
                        logger.info(
                            'aggregator :: %s entries in flux.aggregator.queue to process'
                            % str(len(flux_aggregator_queue)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: could not get the flux.aggregator.queue set from Redis'
                        )
                else:
                    logger.info(
                        'aggregator :: not primary, in standby to take over should the primary_aggregator fail'
                    )

                flux_aggregator_queue_items = []
                all_metrics = []
                if flux_aggregator_queue:
                    for flux_aggregator_queue_item_str in flux_aggregator_queue:
                        try:
                            flux_aggregator_queue_item = literal_eval(
                                flux_aggregator_queue_item_str)
                            all_metrics.append(flux_aggregator_queue_item[0])
                            flux_aggregator_queue_items.append([
                                flux_aggregator_queue_item,
                                flux_aggregator_queue_item_str
                            ])
                            # self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item_str)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to evaluate item from flux.aggregator.queue Redis set'
                            )
                metrics = list(set(all_metrics))
                for metric in metrics:
                    last_metric_flush = last_flush
                    last_metric_flush_str = None
                    try:
                        last_metric_flush_str = self.redis_conn_decoded.hget(
                            'flux.aggregate_metrics.last_flush', metric)
                        # Handle new metrics without throwing an error if they do
                        # not have an entry in the hash
                        if last_metric_flush_str:
                            last_metric_flush = int(last_metric_flush_str)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to convert last_metric_flush_str value to an int from flux.aggregate_metrics.last_flush Redis hash for %s'
                            % metric)
                    if not last_metric_flush:
                        # Handle new metrics without throwing an error if they do
                        # not have an entry in the hash
                        logger.info(
                            'aggregator :: probable new metric - no last_metric_flush found in flux.aggregate_metrics.last_flush Redis hash for %s using last_flush'
                            % metric)
                        last_metric_flush = last_flush
                    metric_aggregation_settings = {}
                    try:
                        metric_aggregation_settings_str = self.redis_conn_decoded.hget(
                            'metrics_manager.flux.aggregate_namespaces.settings',
                            metric)
                        # @modified 20210718
                        if metric_aggregation_settings_str:
                            metric_aggregation_settings = literal_eval(
                                metric_aggregation_settings_str)
                        else:
                            metric_aggregation_settings = {}
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to determine aggregation_settings from metrics_manager.flux.aggregate_namespaces.settings Redis hash for %s'
                            % metric)

                    # @added 20210718
                    # Handle newly added metrics that have not been added to
                    # metrics_manager.flux.aggregate_namespaces.settings due to
                    # the chicken or the egg problem
                    if not metric_aggregation_settings:
                        logger.info(
                            'aggregator :: probable new metric - %s not found in metrics_manager.flux.aggregate_namespaces.settings Redis hash'
                            % metric)
                        aggregate_namespaces = list(
                            settings.FLUX_AGGREGATE_NAMESPACES.keys())
                        pattern_match, metric_matched_by = matched_or_regexed_in_list(
                            'flux', metric, aggregate_namespaces)
                        if pattern_match:
                            matched_namespace = metric_matched_by[
                                'matched_namespace']
                            metric_aggregation_settings = settings.FLUX_AGGREGATE_NAMESPACES[
                                matched_namespace]
                            logger.info(
                                'aggregator :: new metric - %s determined metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES - %s'
                                % (metric, str(metric_aggregation_settings)))
                        else:
                            logger.error(
                                'error :: aggregator :: new metric - %s could not determine metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES'
                                % (metric))

                    interval = 60
                    try:
                        interval = int(metric_aggregation_settings['interval'])
                    except:
                        # logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to get interval from metric_aggregation_settings for %s, setting to default 60'
                            % metric)
                        interval = 60
                    if (time_now - last_metric_flush) < interval:
                        continue
                    metric_values = []
                    for flux_aggregator_queue_item in flux_aggregator_queue_items:
                        if flux_aggregator_queue_item[0][0] != metric:
                            continue
                        # Discard any values older than the last metric flush
                        if int(flux_aggregator_queue_item[0]
                               [2]) > last_metric_flush:
                            metric_values.append(
                                flux_aggregator_queue_item[0][1])
                        try:
                            self.redis_conn.srem('flux.aggregator.queue',
                                                 flux_aggregator_queue_item[1])
                            remove_from_flux_queue_redis_set.append(
                                flux_aggregator_queue_item[1])
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to remove item from flux.aggregator.queue Redis set - %s'
                                % str(flux_aggregator_queue_item[1]))
                    if not metric_aggregation_settings:
                        logger.error(
                            'error :: no aggregation settings known for %s, discarding data'
                            % metric)
                        continue
                    if metric_values:
                        methods = []
                        try:
                            methods = metric_aggregation_settings['method']
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to determine aggregation methods from metric_aggregation_settings - %s'
                                % str(metric_aggregation_settings))
                            methods = []
                        for method in methods:
                            try:
                                metric_namespace = metric
                                if metric_aggregation_settings[
                                        'method_suffix']:
                                    metric_namespace = '%s.%s' % (metric,
                                                                  method)
                                else:
                                    # @added 20220126 - Feature #4400: flux - quota
                                    # If method_suffix is not set but multiple
                                    # methods are being used, method_suffix
                                    # must be applied, otherwise the metric will
                                    # have all the method values submitted to a
                                    # single metric name.
                                    if len(methods) > 1:
                                        metric_namespace = '%s.%s' % (metric,
                                                                      method)
                                aggregate_value = None
                                if method == 'avg':
                                    if len(metric_values) > 1:
                                        aggregate_value = sum(
                                            metric_values) / len(metric_values)
                                    else:
                                        aggregate_value = metric_values[0]
                                if method == 'sum':
                                    aggregate_value = sum(metric_values)
                                if method == 'max':
                                    aggregate_value = max(metric_values)
                                if method == 'min':
                                    aggregate_value = min(metric_values)
                                if aggregate_value is not None:
                                    try:
                                        backfill = False
                                        metric_data = [
                                            metric_namespace, aggregate_value,
                                            (time_now - interval), backfill
                                        ]
                                        flux.httpMetricDataQueue.put(
                                            metric_data, block=False)
                                        logger.info('aggregator :: added %s' %
                                                    (str(metric_data)))
                                        try:
                                            self.redis_conn.hset(
                                                'flux.aggregate_metrics.last_flush',
                                                metric, time_now)
                                        except:
                                            logger.error(
                                                traceback.format_exc())
                                            logger.error(
                                                'error :: aggregator :: failed to set last metric flush time in Redis hash flux.aggregate_metrics.last_flush'
                                            )
                                    except:
                                        logger.error(traceback.format_exc())
                                        logger.error(
                                            'error :: aggregator :: failed to add aggregator data to flux.httpMetricDataQueue - %s'
                                            % str(metric_data))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: aggregator :: failed to aggregate metric_values by a method for %s'
                                    % str(metric))

                last_flush = time_now

                # flux_zero_fill_metrics = list(self.redis_conn_decoded.smembers('flux.zero_fill_metrics'))

                if FLUX_PERSIST_QUEUE:
                    redis_set_size = 0
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: aggregator :: failed to determine size of flux.queue Redis set'
                        )
                    logger.info(
                        'aggregator :: flux.queue Redis set size of %s before removal of %s items'
                        % (str(redis_set_size),
                           str(len(remove_from_flux_queue_redis_set))))
                    if remove_from_flux_queue_redis_set:
                        try:
                            self.redis_conn.srem(
                                'flux.queue',
                                *set(remove_from_flux_queue_redis_set))
                            remove_from_flux_queue_redis_set = []
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: aggregator :: failed to remove multiple items from flux.queue Redis set'
                            )
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: aggregator :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'aggregator :: flux.queue Redis set size of %s after the removal of items'
                            % (str(redis_set_size)))
                        remove_from_flux_queue_redis_set = []

                if primary_aggregator:
                    try:
                        self.redis_conn.setex(primary_aggregator_key, 300,
                                              aggregator_pid)
                        primary_aggregator_pid = int(
                            self.redis_conn_decoded.get(
                                primary_aggregator_key))
                        logger.info(
                            'aggregator :: set self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                        logger.info(
                            'aggregator :: set Redis primary_aggregator_key key to self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: failed to set Redis primary_aggregator_key key to self pid - %s'
                            % (str(e)))

            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'aggregator :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info(
                    'aggregator :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error('error :: aggregator :: %s' % (str(e)))
Example #18
def adtk_level_shift(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A timeseries is anomalous if a level shift occurs in a 5 window period bound
    by a factor of 9 of the normal range based on historical interquartile range.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself.  For the adtk_level_shift custom
        algorithm the following parameters are required, example:
        ``algorithm_parameters={
            'c': 9.0,
            'run_every': 5,
            'side': 'both',
            'window': 5
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Performance is of paramount importance in Skyline, especially in terms of
    computational complexity, along with execution time and CPU usage. The
    adtk LevelShiftAD algorithm is not O(n) and it is not fast either, not when
    compared to the normal three-sigma triggered algorithms.  However it is
    useful if you care about detecting all level shifts.  The normal three-sigma
    triggered algorithms do not always detect a level shift, especially if the
    level shift does not breach the three-sigma limits.  Therefore you may find
    over time that you encounter alerts that contain level shifts that you
    thought should have been detected.  On these types of metrics and events,
    the adtk LevelShiftAD algorithm can be implemented to detect and alert on
    these.  It is not recommended to run it on all your metrics as it would
    immediately triple the analyzer runtime, even if only run every 5 windows/
    minutes.

    Due to the computational complexity and long run time of the adtk
    LevelShiftAD algorithm on the size of timeseries data used by Skyline, if
    you consider the following timings of all three-sigma triggered algorithms
    and compare them to the adtk_level_shift results in the last 2 rows
    of the log below, it is clear that running adtk_level_shift on all
    metrics is probably not desirable; even if it is possible to do, it is very
    noisy.

    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - histogram_bins run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - histogram_bins has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - histogram_bins - total: 1.051136 - median: 0.001430
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - first_hour_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - first_hour_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - first_hour_average - total: 1.322432 - median: 0.001835
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - stddev_from_average run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - stddev_from_average has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - stddev_from_average - total: 1.097290 - median: 0.001641
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - grubbs run 567 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - grubbs has 567 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - grubbs - total: 1.742929 - median: 0.002438
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - ks_test run 147 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - ks_test has 147 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - ks_test - total: 0.127648 - median: 0.000529
    2021-03-06 10:46:38 :: 1582754 :: algorithm run count - mean_subtraction_cumulation run 40 times
    2021-03-06 10:46:38 :: 1582754 :: algorithm timings count - mean_subtraction_cumulation has 40 timings
    2021-03-06 10:46:38 :: 1582754 :: algorithm timing - mean_subtraction_cumulation - total: 0.152515 - median: 0.003152
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - median_absolute_deviation run 35 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - median_absolute_deviation has 35 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - median_absolute_deviation - total: 0.143770 - median: 0.003248
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - stddev_from_moving_average run 30 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - stddev_from_moving_average has 30 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - stddev_from_moving_average - total: 0.125173 - median: 0.003092
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - least_squares run 16 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - least_squares has 16 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - least_squares - total: 0.089108 - median: 0.005538
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - abs_stddev_from_median run 1 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - abs_stddev_from_median has 1 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - abs_stddev_from_median - total: 0.036797 - median: 0.036797
    2021-03-06 10:46:39 :: 1582754 :: algorithm run count - adtk_level_shift run 271 times
    2021-03-06 10:46:39 :: 1582754 :: algorithm timings count - adtk_level_shift has 271 timings
    2021-03-06 10:46:39 :: 1582754 :: algorithm timing - adtk_level_shift - total: 13.729565 - median: 0.035791
    ...
    ...
    2021-03-06 10:46:39 :: 1582754 :: seconds to run     :: 27.93  # THE TOTAL ANALYZER RUNTIME

    Therefore the analysis methodology implemented for the adtk_level_shift
    custom_algorithm is as follows:

    - When new metrics are added either to the configuration or by actual new
    metrics coming online that match the ``algorithm_parameters['namespace']``,
    Skyline implements sharding on new metrics into time slots to prevent a
    thundering herd situation from developing.  A newly added metric will
    eventually be assigned into a time shard and the last analysed
    timestamp will be added to the ``analyzer.last.adtk_level_shift`` Redis hash
    key to determine the next scheduled run with
    ``algorithm_parameters['namespace']``

    - A ``run_every`` parameter is implemented so that the algorithm can be
    configured to run on a metric once every ``run_every`` minutes.  The default
    is to run it every 5 minutes using window 5 (rolling) and trigger as
    anomalous if the algorithm labels any of the last 5 datapoints as anomalous.
    This means that there could be up to a 5 minute delay on an alert on the
    60 second, 168 SECOND_ORDER_RESOLUTION_HOURS metrics in the example, but a
    ``c=9.0`` level shift would be detected and would be alerted on (if both
    analyzer and mirage triggered on it).  This periodic running of the
    algorithm is a tradeoff so that the adtk_level_shift load and runtime can be
    spread over ``run_every`` minutes.

    - The algorithm is not run against metrics that are sparsely populated.
    When the algorithm is run on sparsely populated metrics it results in lots
    of false positives and noise.

    The Skyline CUSTOM_ALGORITHMS implementation of the adtk LevelShiftAD
    algorithm is configured as the example shown below.  However please note
    that the algorithm_parameters shown in this example configuration are
    suitable for metrics that have a 60 second resolution and have a
    :mod:`settings.ALERTS` Mirage SECOND_ORDER_RESOLUTION_HOURS of 168 (7 days).
    Metrics with a different resolution/frequency may require different
    values appropriate for that resolution.

    Example CUSTOM_ALGORITHMS configuration:

    'adtk_level_shift': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/adtk_level_shift.py',
        'algorithm_parameters': {'c': 9.0, 'run_every': 5, 'side': 'both', 'window': 5},
        'max_execution_time': 0.5,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['adtk_level_shift'],
        'run_3sigma_algorithms': True,
        'run_before_3sigma': True,
        'run_only_if_consensus': False,
        'use_with': ["analyzer", "mirage"],
        'debug_logging': False,
    },

    """

    # You MUST define the algorithm_name
    algorithm_name = 'adtk_level_shift'

    # Define the default state of None and None, anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                  Feature #3642: Anomaly type classification
    return_anomalies = False
    anomalies = []
    realtime_analysis = True

    current_logger = None

    # If you want to log, you can, but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine debug_logging
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug('debug :: %s :: debug_logging enabled with algorithm_parameters - %s' % (
                algorithm_name, str(algorithm_parameters)))
        except:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm to
            # run super fast and without spamming the log with lots of errors.
            # But we do not want the function returning and not reporting
            # anything to the log, so the pythonic except is used to "sample" any
            # algorithm errors to a tmp file and report once per run rather than
            # spewing tons of errors into the log e.g. analyzer.log
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            return (False, None)

    # Allow the LevelShiftAD window parameter to be passed in the
    # algorithm_parameters
    window = 5
    try:
        window = algorithm_parameters['window']
    except:
        pass

    # Allow the LevelShiftAD c parameter to be passed in the
    # algorithm_parameters
    c = 9.0
    try:
        c = algorithm_parameters['c']
    except:
        pass

    run_every = window
    try:
        run_every = algorithm_parameters['run_every']
    except:
        pass

    side = 'both'
    try:
        side = algorithm_parameters['side']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    # @added 20210308 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except:
        return_anomalies = False
    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except:
        realtime_analysis = True

    # @added 20210316 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except:
        pass

    # @added 20210323 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters['save_plots_to_absolute_dir']
    except:
        pass
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except:
        pass

    # @added 20210318 - Feature #3978: luminosity - classify_metrics
    #                   Feature #3642: Anomaly type classification
    run_PersistAD = False
    try:
        run_PersistAD = algorithm_parameters['run_PersistAD']
    except:
        pass

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' % (
            str(algorithm_parameters)))

    try:
        base_name = algorithm_parameters['base_name']
    except:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback of any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors,
        # but we do not want the function returning without reporting
        # anything to the log, so the bare except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return False and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' % (
            algorithm_name, str(base_name)))

    # Due to the load and runtime of LevelShiftAD it is only run in analyzer
    # periodically
    if current_skyline_app == 'analyzer':
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
        last_hash_key = 'analyzer.last.%s' % algorithm_name
        last_check = None
        try:
            raw_last_check = redis_conn_decoded.hget(last_hash_key, base_name)
            last_check = int(raw_last_check)
        except:
            last_check = None
        last_window_timestamps = [int(item[0]) for item in timeseries[-run_every:]]
        if last_check in last_window_timestamps:
            if debug_logging:
                current_logger.debug('debug :: %s :: run_every period is not over yet, skipping base_name - %s' % (
                    algorithm_name, str(base_name)))
            if return_anomalies:
                return (False, None, anomalies)
            else:
                return (False, None)

        # If there is no last timestamp, shard the metric, it will eventually
        # be added.
        if not last_check:
            now = datetime.datetime.now()
            now_seconds = int(now.second)
            if now_seconds == 0:
                now_seconds = 1
            period_seconds = int(60 / run_every)
            last_shard = 60
            shard = int(period_seconds)
            shards = [shard]
            while shard < last_shard:
                shard = shard + period_seconds
                shards.append((shard))
            shard_value = round(now_seconds / shards[0]) * shards[0]
            if shard_value <= shards[0]:
                shard_value = shards[0]
            metric_as_bytes = str(base_name).encode()
            value = zlib.adler32(metric_as_bytes)
            shard_index = [(index + 1) for index, s_value in enumerate(shards) if s_value == shard_value][0]
            modulo_result = value % shard_index
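            # Worked example (illustrative only, derived from the defaults above,
            # not from Skyline documentation): with run_every=5, period_seconds is
            # 12 and shards is [12, 24, 36, 48, 60].  A now_seconds of 30 gives
            # shard_value 24 (round(2.5) is 2 under Python 3 rounding) and
            # shard_index 2, so the metric is skipped in this run whenever
            # adler32 of the metric name % 2 == 0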
            if modulo_result == 0:
                if debug_logging:
                    current_logger.debug('debug :: %s :: skipping as not sharded into this run - %s' % (
                        algorithm_name, str(base_name)))
                if return_anomalies:
                    return (False, None, anomalies)
                else:
                    return (False, None)
        if debug_logging:
            current_logger.debug('debug :: %s :: analysing %s' % (
                algorithm_name, str(base_name)))

        try:
            int_metric_timestamp = int(timeseries[-1][0])
        except:
            int_metric_timestamp = 0
        if int_metric_timestamp:
            try:
                redis_conn_decoded.hset(
                    last_hash_key, base_name,
                    int_metric_timestamp)
            except:
                pass
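        # Illustrative note (derived from the code above, not from Skyline docs):
        # the analyzer.last.adtk_level_shift Redis hash simply maps base_name to
        # the timestamp of the last analysed data point, e.g.
        # hget('analyzer.last.adtk_level_shift', 'test.metric') -> '1615896000'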

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' % (
                algorithm_name, str(len(timeseries))))

        # Test the data to ensure it meets the minimum requirements.  In the case
        # of Skyline's use of the LevelShiftAD algorithm this means that:
        # - the time series must cover at least 75% of its full_duration
        # - the time series must have at least 90% of the expected data points in
        #   the sample being analysed
        do_not_use_sparse_data = False
        if current_skyline_app == 'analyzer':
            do_not_use_sparse_data = True

        # @added 20210305 - Feature #3970: custom_algorithm - adtk_level_shift
        #                   Task #3664:: POC with adtk
        # With mirage also do not run LevelShiftAD on sparsely populated data
        if current_skyline_app == 'mirage':
            do_not_use_sparse_data = True

        # @added 20210309 - Feature #3978: luminosity - classify_metrics
        #                  Feature #3642: Anomaly type classification
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if do_not_use_sparse_data:

            total_period = 0
            total_datapoints = 0
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine total_period and total_datapoints' % (
                        algorithm_name))
                timeseries = []
            if not timeseries:
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # For analyzer default the required period to 75% of
                # FULL_DURATION (18 hours at the default 24 hour FULL_DURATION)
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            # If the time series does not have 75% of its full_duration it does not
            # have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                            algorithm_name))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                        algorithm_name, e))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error('error :: debug_logging :: %s :: failed to determine if time series has sufficient data' % (
                        algorithm_name))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration data points
            # it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            resolution_timestamps = []
            metric_resolution = False
            for metric_datapoint in timeseries[-30:]:
                timestamp = int(metric_datapoint[0])
                resolution_timestamps.append(timestamp)
            timestamp_resolutions = []
            if resolution_timestamps:
                last_timestamp = None
                for timestamp in resolution_timestamps:
                    if last_timestamp:
                        resolution = timestamp - last_timestamp
                        timestamp_resolutions.append(resolution)
                        last_timestamp = timestamp
                    else:
                        last_timestamp = timestamp
                try:
                    del resolution_timestamps
                except:
                    pass
            if timestamp_resolutions:
                try:
                    timestamp_resolutions_count = Counter(timestamp_resolutions)
                    ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                    metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug('debug_logging :: %s :: SystemExit called, exiting - %s' % (
                            algorithm_name, e))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)
                except:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: debug_logging :: %s :: failed to determine the metric resolution' % (
                            algorithm_name))
                try:
                    del timestamp_resolutions
                except:
                    pass
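            # Illustrative note (values are examples only): for 60 second data the
            # timestamp_resolutions are e.g. [60, 60, 120, 60], most_common()
            # yields [(60, 3), (120, 1)] and metric_resolution is therefore 60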
            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s' % (
                            algorithm_name, str(minimum_datapoints),
                            str(total_datapoints)))
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    else:
                        return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
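            # Worked example (values are illustrative only): a 24 hour period at a
            # 60s resolution expects 1440 data points, so with 1380 data points
            # present sparsity = int(1380 / (1440 / 100)) = 95, which passes the
            # minimum_percentage_sparsity check of 90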
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s' % (
                        algorithm_name, str(minimum_percentage_sparsity),
                        str(sparsity)))
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug('debug :: %s :: time series does not have sufficient variability, all the values are the same' % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                else:
                    return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug('debug :: %s :: preprocessing took %.6f seconds' % (
                algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: LevelShiftAD not run as no data' % (
                    algorithm_name))
            anomalies = []
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        else:
            if debug_logging:
                current_logger.debug('debug :: %s :: timeseries length: %s' % (
                    algorithm_name, str(len(timeseries))))

        if len(timeseries) < 100:
            if debug_logging:
                current_logger.debug('debug :: %s :: time series does not have sufficient data' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        start_analysis = timer()
        try:
            df = pd.DataFrame(timeseries, columns=['date', 'value'])
            df['date'] = pd.to_datetime(df['date'], unit='s')
            datetime_index = pd.DatetimeIndex(df['date'].values)
            df = df.set_index(datetime_index)
            df.drop('date', axis=1, inplace=True)
            s = validate_series(df)
            level_shift_ad = LevelShiftAD(c=c, side=side, window=window)
            anomaly_df = level_shift_ad.fit_detect(s)
            anomalies = anomaly_df.loc[anomaly_df['value'] > 0]
            anomalous = False
            if len(anomalies) > 0:
                anomaly_timestamps = list(anomalies.index.astype(np.int64) // 10**9)
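                # Note (illustrative): anomalies.index is a pandas DatetimeIndex in
                # nanoseconds, so // 10**9 converts it back to unix seconds, e.g.
                # Timestamp('2021-03-16 12:00:00') becomes 1615896000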
                if realtime_analysis:
                    last_window_timestamps = [int(item[0]) for item in timeseries[-window:]]
                    # if timeseries[-1][0] in anomaly_timestamps:
                    for timestamp in last_window_timestamps:
                        if timestamp in anomaly_timestamps:
                            anomalous = True
                            break
                else:
                    anomalous = True
                    # Convert anomalies dataframe to anomalies_list
                    anomalies_list = []

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    # Convert anomalies dataframe to anomalies_dict
                    anomalies_dict = {}
                    anomalies_dict['metric'] = base_name
                    anomalies_dict['timestamp'] = int(timeseries[-1][0])
                    anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                    anomalies_dict['algorithm'] = algorithm_name
                    anomalies_dict['anomalies'] = {}

                    for ts, value in timeseries:
                        if int(ts) in anomaly_timestamps:
                            anomalies_list.append([int(ts), value])
                            anomalies_dict['anomalies'][int(ts)] = value
                    anomalies = list(anomalies_list)

                    # @added 20210316 - Feature #3978: luminosity - classify_metrics
                    #                   Feature #3642: Anomaly type classification
                    if save_plots_to:
                        try:
                            from adtk.visualization import plot
                            metric_dir = base_name.replace('.', '/')
                            timestamp_dir = str(int(timeseries[-1][0]))
                            save_path = '%s/%s/%s/%s' % (
                                save_plots_to, algorithm_name, metric_dir,
                                timestamp_dir)
                            if save_plots_to_absolute_dir:
                                save_path = '%s' % save_plots_to
                            anomalies_dict['file_path'] = save_path
                            save_to_file = '%s/%s.%s.png' % (
                                save_path, algorithm_name, base_name)
                            if filename_prefix:
                                save_to_file = '%s/%s.%s.%s.png' % (
                                    save_path, filename_prefix, algorithm_name,
                                    base_name)
                            save_to_path = os_path_dirname(save_to_file)
                            title = '%s\n%s' % (algorithm_name, base_name)
                            if not os_path_exists(save_to_path):
                                try:
                                    mkdir_p(save_to_path)
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                        algorithm_name, save_to_path, e))
                            if os_path_exists(save_to_path):
                                try:
                                    plot(s, anomaly=anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                    if debug_logging:
                                        current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                            algorithm_name, save_to_file))
                                except Exception as e:
                                    current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                        algorithm_name, base_name, e))
                            anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                                save_path, algorithm_name, base_name)
                            with open(anomalies_file, 'w') as fh:
                                fh.write(str(anomalies_list))
                                # os.chmod(anomalies_file, mode=0o644)
                            data_file = '%s/data.txt' % (save_path)
                            with open(data_file, 'w') as fh:
                                fh.write(str(anomalies_dict))
                        except SystemExit as e:
                            if debug_logging:
                                current_logger.debug('debug_logging :: %s :: SystemExit called during save plot, exiting - %s' % (
                                    algorithm_name, e))
                            if return_anomalies:
                                return (anomalous, anomalyScore, anomalies)
                            else:
                                return (anomalous, anomalyScore)
                        except Exception as e:
                            traceback_msg = traceback.format_exc()
                            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                            if debug_logging:
                                current_logger.error(traceback_msg)
                                current_logger.error('error :: %s :: failed to plot or save anomalies file - %s - %s' % (
                                    algorithm_name, base_name, e))
            else:
                anomalies = []

            # @added 20210318 - Feature #3978: luminosity - classify_metrics
            #                   Feature #3642: Anomaly type classification
            if anomalies and run_PersistAD and not realtime_analysis:
                persist_ad_algorithm_parameters = {}
                try:
                    persist_ad_algorithm_parameters = algorithm_parameters['persist_ad_algorithm_parameters']
                except:
                    pass
                persist_ad_window = 20
                try:
                    persist_ad_window = persist_ad_algorithm_parameters['window']
                except:
                    pass
                persist_ad_c = 9.9
                try:
                    persist_ad_c = persist_ad_algorithm_parameters['c']
                except:
                    pass
                try:
                    from adtk.detector import PersistAD
                    persist_ad = PersistAD(c=persist_ad_c, side='both', window=persist_ad_window)
                    persist_ad_anomaly_df = persist_ad.fit_detect(s)
                    persist_ad_anomalies = persist_ad_anomaly_df.loc[persist_ad_anomaly_df['value'] > 0]
                    if len(persist_ad_anomalies) > 0:
                        current_logger.info('%s :: %s anomalies found with PersistAD on %s' % (
                            algorithm_name, str(len(persist_ad_anomalies)),
                            base_name))
                        persist_ad_anomaly_timestamps = list(persist_ad_anomalies.index.astype(np.int64) // 10**9)
                        # Convert persist_ad_anomalies dataframe to persist_ad_anomalies_list
                        persist_ad_anomalies_list = []
                        persist_ad_anomalies_dict = {}
                        persist_ad_anomalies_dict['metric'] = base_name
                        persist_ad_anomalies_dict['timestamp'] = int(timeseries[-1][0])
                        persist_ad_anomalies_dict['from_timestamp'] = int(timeseries[0][0])
                        persist_ad_anomalies_dict['algorithm'] = 'adtk_PersistAD'
                        persist_ad_anomalies_dict['anomalies'] = {}

                        for ts, value in timeseries:
                            if int(ts) in persist_ad_anomaly_timestamps:
                                persist_ad_anomalies_list.append([int(ts), value])
                                persist_ad_anomalies_dict['anomalies'][int(ts)] = value
                        persist_ad_anomalies = list(persist_ad_anomalies_list)
                        if save_plots_to:
                            try:
                                from adtk.visualization import plot
                                metric_dir = base_name.replace('.', '/')
                                timestamp_dir = str(int(timeseries[-1][0]))
                                save_path = '%s/%s/%s/%s' % (
                                    save_plots_to, algorithm_name, metric_dir,
                                    timestamp_dir)
                                if save_plots_to_absolute_dir:
                                    save_path = '%s' % save_plots_to
                                persist_ad_anomalies_dict['file_path'] = save_path
                                save_to_file = '%s/%s.PersistAD.%s.png' % (
                                    save_path, algorithm_name, base_name)
                                if filename_prefix:
                                    save_to_file = '%s/%s.%s.%s.png' % (
                                        save_path, filename_prefix, algorithm_name,
                                        base_name)
                                save_to_path = os_path_dirname(save_to_file)
                                title = '%s - PersistAD verification\n%s' % (algorithm_name, base_name)
                                if not os_path_exists(save_to_path):
                                    try:
                                        mkdir_p(save_to_path)
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to create dir - %s - %s' % (
                                            algorithm_name, save_to_path, e))
                                if os_path_exists(save_to_path):
                                    try:
                                        plot(s, anomaly=persist_ad_anomaly_df, anomaly_color='red', title=title, save_to_file=save_to_file)
                                        if debug_logging:
                                            current_logger.debug('debug :: %s :: plot saved to - %s' % (
                                                algorithm_name, save_to_file))
                                    except Exception as e:
                                        current_logger.error('error :: %s :: failed to plot - %s - %s' % (
                                            algorithm_name, base_name, e))
                                anomalies_file = '%s/%s.%s.PersistAD.anomalies_list.txt' % (
                                    save_path, algorithm_name, base_name)
                                with open(anomalies_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies))
                                    # os.chmod(anomalies_file, mode=0o644)
                                data_file = '%s/PersistAD.data.txt' % (save_path)
                                with open(data_file, 'w') as fh:
                                    fh.write(str(persist_ad_anomalies_dict))
                            except Exception as e:
                                traceback_msg = traceback.format_exc()
                                record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                                if debug_logging:
                                    current_logger.error(traceback_msg)
                                    current_logger.error('error :: %s :: failed to plot or save PersistAD anomalies file - %s - %s' % (
                                        algorithm_name, base_name, e))
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error('error :: %s :: failed to run PersistAD analysis - %s - %s' % (
                            algorithm_name, base_name, e))
            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug('debug_logging :: %s :: SystemExit called, during analysis, exiting - %s' % (
                    algorithm_name, e))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error('error :: debug_logging :: %s :: failed to run on ts' % (
                    algorithm_name))
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            else:
                return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug('debug :: %s :: LevelShiftAD took %.6f seconds' % (
                algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info('%s :: anomalous - %s, anomalyScore - %s' % (
                algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed analysis in %.6f seconds' % (
                algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)

    except SystemExit as e:
        if debug_logging:
            current_logger.debug('debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s' % (
                algorithm_name, e))
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        else:
            return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback of any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors,
        # but we do not want the function returning without reporting
        # anything to the log, so the bare except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name, traceback.format_exc())
        # Return False and None as the algorithm could not determine True or False
        if return_anomalies:
            return (False, None, anomalies)
        else:
            return (False, None)

    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    else:
        return (anomalous, anomalyScore)
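
A minimal standalone sketch of exercising a Skyline custom algorithm like the one
above, outside the analyzer/mirage/luminosity pipelines.  The function name
adtk_level_shift, its (current_skyline_app, parent_pid, timeseries,
algorithm_parameters) argument order and the parent_pid value are assumptions for
illustration, inferred from the algorithm_name and parameters handled above rather
than taken from the Skyline API:

# Hypothetical standalone invocation of the custom algorithm defined above
import time

now = int(time.time())
# One day of 60 second data with an artificial level shift in the last hour
timeseries = [[now - (86400 - (i * 60)), 10.0 if i < 1380 else 100.0]
              for i in range(1440)]
algorithm_parameters = {
    'base_name': 'test.metric',   # assumed metric name, for illustration only
    'window': 5,
    'c': 9.0,
    'return_anomalies': True,
    'realtime_analysis': False,
    'debug_logging': False,
}
# 'luminosity' is used so that the analyzer-only Redis sharding path above is
# not exercised; 12345 stands in for the parent process pid
anomalous, anomalyScore, anomalies = adtk_level_shift(
    'luminosity', 12345, timeseries, algorithm_parameters)
print(anomalous, anomalyScore, len(anomalies))
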
def get_external_alert_configs(current_skyline_app):
    """
    Return the external alert configs built from :mod:`settings.EXTERNAL_ALERTS`
    (or fetched from cache), the internal alert configs built from
    :mod:`settings.ALERTS`, a concatenated and deduplicated all_alerts list and
    whether each was retrieved from the Redis cache or built from source.

    :param current_skyline_app: the app calling the function so the function
        knows which log to write to.
    :type current_skyline_app: str
    :return: (external_alert_configs, external_from_cache, internal_alert_configs,
        internal_from_cache, all_alerts, all_from_cache)
    :rtype: (dict, boolean, dict, boolean, list, boolean)

    """
    debug_get_external_alert_configs = None

    # Set the default dicts to return
    external_alert_configs = {}
    # Set the default dict to return
    internal_alert_configs = {}
    # Set the default all_alerts to return
    all_alerts = list(settings.ALERTS)
    all_alert_configs = None
    # Set the default external_from_cache to return
    external_from_cache = None
    # Set the default internal_from_cache to return
    internal_from_cache = None
    # Set the default all_from_cache to return
    all_from_cache = None

    last_known_redis_key = 'skyline.last_known.external_alert_configs'

    # Get the logger
    current_skyline_app_logger = str(current_skyline_app) + 'Log'
    current_logger = logging.getLogger(current_skyline_app_logger)

    # Define the items that are expected in the external alert config json
    EXTERNAL_ALERTS_JSON_ITEMS = ('alerter', 'expiration', 'namespace',
                                  'namespace_prefix',
                                  'second_order_resolution',
                                  'second_order_resolution_hours',
                                  'learn_days', 'inactive_after')
    OPTIONAL_EXTERNAL_ALERTS_JSON_ITEMS = ('namespace_prefix',
                                           'second_order_resolution_hours',
                                           'learn_days', 'inactive_after')
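    # Illustrative (assumed) shape of the JSON an external alert source returns,
    # inferred from the parsing below rather than from any Skyline documentation:
    # {"data": {"<alerter_id>": {"alerter": "<alerter>", "expiration": 900,
    #                            "namespace": "metric5.thing",
    #                            "namespace_prefix": "org",
    #                            "second_order_resolution": 604800,
    #                            "inactive_after": 7200}}}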

    try:
        EXTERNAL_ALERTS = settings.EXTERNAL_ALERTS
        if debug_get_external_alert_configs:
            current_logger.debug(
                'debug :: get_external_alert_configs settings.EXTERNAL_ALERTS is defined'
            )
    except:
        return (external_alert_configs, external_from_cache,
                internal_alert_configs, internal_from_cache, tuple(all_alerts),
                all_from_cache)

    redis_conn_decoded = None
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: get_external_alert_configs :: failed to get decoded Redis connection'
        )

    # The all_alert_configs Redis key is cached for 60 seconds, if found return
    # as it is all that is needed
    redis_key = 'skyline.all_alert_configs'
    raw_all_alert_configs = None
    if redis_conn_decoded:
        try:
            raw_all_alert_configs = redis_conn_decoded.get(redis_key)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to query Redis for skyline.all_alert_configs'
            )
    if raw_all_alert_configs:
        try:
            all_alert_configs = literal_eval(raw_all_alert_configs)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to literal_eval skyline.all_alert_configs'
            )
    if all_alert_configs:
        # Set that the all_alert_configs was fetched from cache
        all_from_cache = True
        return (external_alert_configs, external_from_cache,
                internal_alert_configs, internal_from_cache, all_alert_configs,
                all_from_cache)

    redis_key = 'skyline.external_alert_configs'
    raw_external_alert_configs = None
    if redis_conn_decoded:
        try:
            raw_external_alert_configs = redis_conn_decoded.get(redis_key)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to query Redis for skyline.external_alert_configs'
            )
    if raw_external_alert_configs:
        try:
            external_alert_configs = literal_eval(raw_external_alert_configs)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to literal_eval skyline.external_alert_configs'
            )
    if external_alert_configs:
        # Set that the external_alert_config was fetched from cache
        external_from_cache = True
        if redis_conn_decoded:
            try:
                redis_conn_decoded.set(last_known_redis_key,
                                       str(external_alert_configs))
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: failed to set %s Redis key'
                    % last_known_redis_key)

    redis_key = 'skyline.internal_alert_configs'
    raw_internal_alert_configs = None
    if redis_conn_decoded:
        try:
            raw_internal_alert_configs = redis_conn_decoded.get(redis_key)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to query Redis for skyline.internal_alert_configs'
            )
    if raw_internal_alert_configs:
        try:
            internal_alert_configs = literal_eval(raw_internal_alert_configs)
        except:
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: get_external_alert_configs :: failed to literal_eval skyline.internal_alert_configs'
            )
    if internal_alert_configs:
        # Set that the internal_alert_configs was fetched from cache
        internal_from_cache = True

    # If the external_alert_configs were not fetched from cache build them
    if not external_alert_configs:
        for external_alert_config in EXTERNAL_ALERTS:
            external_alert_config_url = None
            try:
                external_alert_config_url = EXTERNAL_ALERTS[
                    external_alert_config]['url']
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: could not determine url from EXTERNAL_ALERTS[\'%s\'][\'url\']'
                    % (str(external_alert_config)))
                continue
            external_alert_config_method = None
            try:
                external_alert_config_method = EXTERNAL_ALERTS[
                    external_alert_config]['method']
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: could not determine method from EXTERNAL_ALERTS[\'%s\'][\'method\']'
                    % (str(external_alert_config)))
                continue
            external_alert_config_post_data = None
            if external_alert_config_method == 'POST' or external_alert_config_method == 'post':
                try:
                    external_alert_config_post_data = EXTERNAL_ALERTS[
                        external_alert_config]['data']
                except:
                    external_alert_config_post_data = None
            external_alert_json = None
            try:
                current_logger.info(
                    'get_external_alert_configs :: retrieving alert config json for %s from %s via %s'
                    % (str(external_alert_config),
                       str(external_alert_config_url),
                       str(external_alert_config_method)))
                if external_alert_config_method == 'GET':
                    r = requests.get(external_alert_config_url, timeout=2)
                if external_alert_config_method == 'POST':
                    header = {"content-type": "application/json"}
                    if external_alert_config_post_data:
                        r = requests.post(
                            external_alert_config_url,
                            data=json.dumps(external_alert_config_post_data),
                            headers=header,
                            timeout=2)
                    else:
                        r = requests.post(external_alert_config_url,
                                          headers=header,
                                          timeout=2)
                external_alert_json = r.json()
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: could not retrieve json from the url - %s'
                    % str(external_alert_config_url))
                continue
            if not external_alert_json:
                current_logger.error(
                    'error :: get_external_alert_configs :: did not retrieve json from the url - %s'
                    % str(external_alert_config_url))
                continue

            for alerter_id in external_alert_json['data']:
                config_id = 'external-%s' % str(alerter_id)
                alerter_config = {'id': config_id}
                namespace_prefix = None
                namespace = None
                for key in EXTERNAL_ALERTS_JSON_ITEMS:
                    try:
                        if key == 'namespace_prefix':
                            try:
                                namespace_prefix = external_alert_json['data'][
                                    alerter_id][key]
                            except:
                                namespace_prefix = None
                        elif key == 'namespace':
                            namespace = external_alert_json['data'][
                                alerter_id][key]
                        else:
                            alerter_config[key] = external_alert_json['data'][
                                alerter_id][key]
                    except:
                        if key in OPTIONAL_EXTERNAL_ALERTS_JSON_ITEMS:
                            if key == 'inactive_after':
                                alerter_config[key] = 7200
                            continue
                        else:
                            current_logger.error(traceback.format_exc())
                            current_logger.error(
                                'error :: get_external_alert_configs :: could not determine %s from json - %s'
                                % (key, str(alerter_id)))
                            alerter_config = {}
                            break
                if alerter_config:
                    try:
                        if namespace_prefix == namespace:
                            full_namespace_str = namespace
                        else:
                            if namespace_prefix is None:
                                full_namespace_str = namespace
                            else:
                                full_namespace_str = '%s.%s' % (
                                    namespace_prefix, namespace)
                        full_namespace = full_namespace_str.replace(',', '.')
                        alerter_config['namespace'] = full_namespace
                    except:
                        current_logger.error(traceback.format_exc())
                        current_logger.error(
                            'error :: get_external_alert_configs :: failed to interpolate full_namespace from namespace_prefix and namespace in the json - %s'
                            % str(external_alert_json['data'][alerter_id]))
                        continue
                    try:
                        alerter_config['type'] = 'external'
                    except:
                        current_logger.error(traceback.format_exc())
                        current_logger.error(
                            'error :: get_external_alert_configs :: failed to add type external to alerter_config'
                        )
                        continue
                    try:
                        external_alert_configs[alerter_id] = alerter_config
                    except:
                        current_logger.error(traceback.format_exc())
                        current_logger.error(
                            'error :: get_external_alert_configs :: could not add alert_config dict to external_alert_configs dict from json - %s'
                            % str(external_alert_json['data'][alerter_id]))
                        continue

    # If the key expired and no alerter_configs were constructed from the
    # external source then use the last known good external_alert_configs
    last_good_external_alert_configs = None
    if not external_alert_configs:
        if redis_conn_decoded:
            last_good_raw_external_alert_configs = None
            try:
                last_good_raw_external_alert_configs = redis_conn_decoded.get(
                    last_known_redis_key)
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: failed to query Redis for %s'
                    % last_known_redis_key)
            last_good_external_alert_configs = None
            if last_good_raw_external_alert_configs:
                try:
                    last_good_external_alert_configs = literal_eval(
                        last_good_raw_external_alert_configs)
                except:
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: get_external_alert_configs :: failed to literal_eval skyline.last_known.external_alert_configs'
                    )
            if last_good_external_alert_configs:
                current_logger.info(
                    'get_external_alert_configs :: failed to construct the external_alert_configs from source, using skyline.last_known.external_alert_configs instead'
                )
                external_alert_configs = last_good_external_alert_configs
                external_from_cache = True

    # Build the all_alerts list by concatenating the external_alert_configs
    new_all_alerts = []

    if external_alert_configs:
        # external smtp alerts
        # All are set to no_email in the analyzer and mirage alerters as every
        # alert must be routed through the smtp workflow, even if it does not
        # send an smtp alert, because the smtp alert route creates the training
        # data resources.
        for external_alert_config in external_alert_configs:
            config_id = None
            namespace = None
            expiration = None
            second_order_resolution = None
            second_order_resolution_hours = None
            try:
                config_id = external_alert_configs[external_alert_config]['id']
            except:
                continue
            try:
                namespace = external_alert_configs[external_alert_config][
                    'namespace']
            except:
                continue
            try:
                expiration = int(external_alert_configs[external_alert_config]
                                 ['expiration'])
            except:
                continue
            try:
                second_order_resolution = int(
                    external_alert_configs[external_alert_config]
                    ['second_order_resolution'])
                second_order_resolution_hours = int(second_order_resolution /
                                                    3600)
            except:
                continue

            # First add an smtp no_email alerter for the external_alert_config;
            # this is required to route anomalies through the training_data
            # resources creation workflow
            # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168),
            new_all_alerts.append([
                namespace, 'smtp', expiration, second_order_resolution_hours,
                external_alert_configs[external_alert_config]
            ])

    # internal smtp alerts
    for index, alert in enumerate(settings.ALERTS):
        # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168),
        if str(alert[1]) == 'smtp':
            try:
                second_order_resolution_hours = int(alert[3])
                second_order_resolution = second_order_resolution_hours * 3600
            except:
                second_order_resolution = 0
            config_id = 'internal-%s' % str(index)
            internal_alert_config = {
                'id': config_id,
                'alerter': alert[1],
                'namespace': alert[0],
                'expiration': alert[2],
                'second_order_resolution': second_order_resolution,
                'inactive_after': 7200,
                'type': 'internal'
            }
            new_all_alerts.append([
                alert[0], alert[1], alert[2], second_order_resolution_hours,
                internal_alert_config
            ])
            try:
                internal_alert_configs[index] = internal_alert_config
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: could not add internal_alert_config dict to internal_alert_configs dict'
                )
                continue

    # external alerts - non-smtp
    if external_alert_configs:
        for external_alert_config in external_alert_configs:
            config_id = None
            alerter = None
            namespace = None
            expiration = None
            second_order_resolution = None
            second_order_resolution_hours = 0
            try:
                config_id = external_alert_configs[external_alert_config]['id']
            except:
                continue
            try:
                alerter = external_alert_configs[external_alert_config][
                    'alerter']
            except:
                continue
            try:
                namespace = external_alert_configs[external_alert_config][
                    'namespace']
            except:
                continue
            try:
                expiration = int(external_alert_configs[external_alert_config]
                                 ['expiration'])
            except:
                continue
            try:
                second_order_resolution = int(
                    external_alert_configs[external_alert_config]
                    ['second_order_resolution'])
                second_order_resolution_hours = int(second_order_resolution /
                                                    3600)
            except:
                continue

            # Now add the external (non-smtp) alerter itself for the
            # external_alert_config
            # alert = ('metric5.thing.*.rpm', '<external_alerter>', 900, 168),
            new_all_alerts.append([
                namespace, alerter, expiration, second_order_resolution_hours,
                external_alert_configs[external_alert_config]
            ])

    # internal non smtp alerts
    for index, alert in enumerate(settings.ALERTS):
        # alert = ('metric5.thing.*.rpm', 'smtp', 900, 168),
        if str(alert[1]) != 'smtp':
            try:
                second_order_resolution_hours = int(alert[3])
                second_order_resolution = second_order_resolution_hours * 3600
            except:
                second_order_resolution_hours = 0
            config_id = 'internal-%s' % str(index)
            internal_alert_config = {
                'id': config_id,
                'alerter': alert[1],
                'namespace': alert[0],
                'expiration': alert[2],
                'second_order_resolution': second_order_resolution,
                'inactive_after': 7200,
                'type': 'internal'
            }
            new_all_alerts.append([
                alert[0], alert[1], alert[2], second_order_resolution_hours,
                internal_alert_config
            ])
            try:
                internal_alert_configs[index] = internal_alert_config
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: could not add internal_alert_config dict to internal_alert_configs dict'
                )
                continue

    if new_all_alerts:
        all_alerts = tuple(new_all_alerts)

    if redis_conn_decoded and external_alert_configs:
        if not external_from_cache:
            redis_key = 'skyline.external_alert_configs'
            try:
                redis_conn_decoded.setex(redis_key, 300,
                                         str(external_alert_configs))
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: failed to set %s' %
                    redis_key)
    if redis_conn_decoded and internal_alert_configs:
        if not internal_from_cache:
            redis_key = 'skyline.internal_alert_configs'
            try:
                redis_conn_decoded.setex(redis_key, 60,
                                         str(internal_alert_configs))
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: failed to set %s' %
                    redis_key)
    if redis_conn_decoded and all_alerts:
        if not all_from_cache:
            redis_key = 'skyline.all_alert_configs'
            try:
                redis_conn_decoded.setex(redis_key, 60, str(all_alerts))
            except:
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: get_external_alert_configs :: failed to set %s' %
                    redis_key)

    return (external_alert_configs, external_from_cache,
            internal_alert_configs, internal_from_cache, all_alerts,
            all_from_cache)
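
A small usage sketch of the function above; the 'analyzer' app name and the shape
of the printed fields are assumptions based on the code above rather than on any
Skyline documentation:

# Hypothetical caller, e.g. in an alerting code path
(external_alert_configs, external_from_cache,
 internal_alert_configs, internal_from_cache,
 all_alerts, all_from_cache) = get_external_alert_configs('analyzer')

for alert in all_alerts:
    # each entry is like [namespace, alerter, expiration,
    #                     second_order_resolution_hours, alert_config_dict]
    print('%s -> %s (expiration: %ss)' % (str(alert[0]), str(alert[1]), str(alert[2])))
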
Beispiel #20
0
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        # In Vista the log management is handled by the fetcher, the worker just
        # waits for the fetcher to do the log management
        now = int(time())
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = int(time())
            else:
                now = log_wait_for + 1

        logger.info('worker :: starting log management')
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: worker :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('worker :: log lock file removed')
            except OSError:
                logger.error(
                    'error :: worker :: failed to remove %s, continuing' %
                    skyline_app_loglock)
                pass
        else:
            logger.info('worker :: bin/%s.d log management done' % skyline_app)

        logger.info('worker :: starting worker')

        try:
            VISTA_ENABLED = settings.VISTA_ENABLED
            logger.info('worker :: VISTA_ENABLED is set to %s' %
                        str(VISTA_ENABLED))
        except:
            VISTA_ENABLED = False
            logger.info(
                'worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False'
            )

        last_sent_to_graphite = int(time())
        metrics_sent_to_flux = 0

        # python-2.x and python-3.x treat while 1 and while True slightly
        # differently, so a named running flag is used instead
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                    if LOCAL_DEBUG:
                        logger.info('worker :: redis is up')
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metrics_data = []
            redis_set = 'vista.fetcher.metrics.json'
            try:
                # Get a metric to validate from the Redis set

                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # metrics_data = self.redis_conn.smembers(redis_set)
                metrics_data = self.redis_conn_decoded.smembers(redis_set)

                if LOCAL_DEBUG:
                    logger.info('worker :: got redis set data - %s' %
                                redis_set)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: retrieving Redis set %s data' %
                    str(redis_set))

            if not metrics_data:
                if LOCAL_DEBUG:
                    logger.info('worker :: no data from Redis set %s' %
                                str(redis_set))
                sleep(5)

            for str_metric_data in metrics_data:
                delete_set_record = False
                remote_host_type = None
                try:

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Rather, use get_redis_conn_decoded
                    # if python_version == 3:
                    #     str_metric_data = str_metric_data.decode('UTF-8')

                    metric_data = literal_eval(str_metric_data)
                    remote_host_type = str(metric_data[0]['remote_host_type'])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for remote_host_type %s'
                            % str(remote_host_type))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine remote_host_type from %s'
                        % str(str_metric_data))
                    delete_set_record = True
                if not delete_set_record:
                    try:
                        remote_target = str(metric_data[0]['remote_target'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for target %s'
                                % str(remote_target))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine target from %s'
                            % str(str_metric_data))
                        delete_set_record = True
                metric = None
                if not delete_set_record:
                    try:
                        metric = str(metric_data[0]['metric'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for metric %s'
                                % str(metric))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine metric from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                namespace_prefix = ''
                if not delete_set_record:
                    try:
                        namespace_prefix = str(
                            metric_data[0]['namespace_prefix'])
                        if not namespace_prefix or namespace_prefix == 'None':
                            namespace_prefix = ''
                        else:
                            namespace_prefix = '%s.' % namespace_prefix
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for namespace_prefix %s'
                                % str(namespace_prefix))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine namespace_prefix from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                have_data = False
                timeseries = []
                if not delete_set_record:
                    last_flux_metric_data = None
                    cache_key = 'flux.last.%s' % (metric)
                    try:
                        if python_version == 3:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key).decode('UTF-8')
                        else:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key)
                        last_flux_metric_data = literal_eval(
                            redis_last_flux_metric_data)
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got last_flux_metric_data from Redis'
                            )
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: retrieving Redis key %s data' %
                            str(cache_key))
                        last_flux_metric_data = False

                    last_flux_timestamp = None
                    if last_flux_metric_data:
                        try:
                            last_flux_timestamp = int(last_flux_metric_data[0])
                            if LOCAL_DEBUG:
                                logger.info(
                                    'worker :: got last_flux_timestamp - %s' %
                                    str(last_flux_timestamp))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed determining last_flux_timestamp'
                            )
                            last_flux_timestamp = False

                    # Determine the timestamp of the current minute to apply
                    # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                    time_now = int(time())
                    # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M')
                    current_minute_hour = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%H'))
                    current_minute_minute = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%M'))
                    current_datetime = datetime.datetime.utcfromtimestamp(
                        time_now).replace(hour=current_minute_hour,
                                          minute=current_minute_minute,
                                          second=0,
                                          microsecond=0)
                    current_minute_timestamp_start = int(
                        current_datetime.strftime('%s'))
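                    # Note: the start of the current UTC minute can also be
                    # computed as time_now - (time_now % 60), which avoids
                    # the platform dependent strftime('%s') call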

                    datapoint = None
                    last_timestamp_with_data = None
                    timeseries = []

                    # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                    metric_resolution = 60
                    metric_resolution_determined = False

                    try:
                        if python_version == 3:
                            datapoints_str = literal_eval(
                                metric_data[0]['datapoints'])
                            metric_datapoints = literal_eval(datapoints_str)
                        else:
                            # metric_datapoints = metric_data[0]['datapoints']
                            datapoints_str = literal_eval(
                                metric_data[0]['datapoints'])
                            metric_datapoints = literal_eval(datapoints_str)
                        # for value, timestamp in metric_data[0]['datapoints']:
                        if LOCAL_DEBUG:
                            len_metric_datapoints = len(metric_datapoints)
                            logger.info(
                                'worker :: got %s metric_datapoints - %s' %
                                (str(len_metric_datapoints),
                                 str(metric_datapoints)))

                        # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                        # Determine resolution
                        resolution_timestamps = []
                        for metric_datapoint in metric_datapoints:
                            timestamp = int(metric_datapoint[0])
                            resolution_timestamps.append(timestamp)
                        timestamp_resolutions = []
                        if resolution_timestamps:
                            last_timestamp = None
                            for timestamp in resolution_timestamps:
                                if last_timestamp:
                                    resolution = timestamp - last_timestamp
                                    timestamp_resolutions.append(resolution)
                                    last_timestamp = timestamp
                                else:
                                    last_timestamp = timestamp
                        if timestamp_resolutions:
                            try:
                                timestamp_resolutions_count = Counter(
                                    timestamp_resolutions)
                                ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                                )
                                metric_resolution = int(
                                    ordered_timestamp_resolutions_count[0][0])
                                if metric_resolution > 0:
                                    metric_resolution_determined = True
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine metric_resolution from %s'
                                    % (str(metric_data)))
                        if metric_resolution_determined:
                            cache_key = 'vista.last.resolution.%s' % metric
                            try:
                                # Update Redis key
                                self.redis_conn.setex(cache_key, 3600,
                                                      metric_resolution)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to set Redis key - %s'
                                    % (cache_key))

                        for metric_datapoint in metric_datapoints:
                            # @20191010 - Branch #3140: vista
                            # fetcher passes through preformatted data points that
                            # are in the same format/order for both graphite and
                            # prometheus
                            # if remote_host_type == 'graphite':
                            #     value = float(metric_datapoint[0])
                            #     timestamp = int(metric_datapoint[1])
                            # if remote_host_type == 'prometheus':
                            #     value = float(metric_datapoint[1])
                            #     timestamp = int(metric_datapoint[0])
                            timestamp = int(metric_datapoint[0])
                            value = float(metric_datapoint[1])

                            append_to_timeseries = False
                            if last_flux_timestamp:
                                if int(timestamp) > last_flux_timestamp:
                                    # timeseries.append([timestamp, value])
                                    append_to_timeseries = True
                            else:
                                # timeseries.append([timestamp, value])
                                append_to_timeseries = True

                            # Here if the timestamp of the data point falls
                            # within the current minute, it is discarded and not
                            # sent to flux, to ensure that high frequency metrics
                            # can have their minutely bins fully populated before
                            # they are submitted to Graphite
                            if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                                if int(timestamp
                                       ) >= current_minute_timestamp_start:
                                    append_to_timeseries = False
                            if append_to_timeseries:
                                timeseries.append([timestamp, value])

                        last_timestamp_with_data = 0
                        for timestamp, value in timeseries[::-1]:
                            has_value = False
                            if value == 0.0:
                                has_value = True
                            if value:
                                has_value = True
                            if has_value:
                                last_timestamp_with_data = int(timestamp)
                                datapoint = value
                                break
                        if last_timestamp_with_data:
                            have_data = True
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine datapoints from %s'
                            % (str(metric_data)))
                        delete_set_record = True
                if not timeseries:
                    logger.info(
                        'worker :: after processing, there were no valid data points in %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if not have_data and timeseries:
                    logger.error(
                        'error :: worker :: failed to determine last_timestamp_with_data from %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if delete_set_record:
                    try:
                        redis_set = 'vista.fetcher.metrics.json'
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (str(redis_set), str(str_metric_data)))
                    continue

                if not metric:
                    continue

                valid_data = True
                if last_flux_timestamp and last_timestamp_with_data:
                    if int(last_timestamp_with_data) <= last_flux_timestamp:
                        valid_data = False
                if not valid_data:
                    redis_set = 'vista.fetcher.metrics.json'
                    logger.info(
                        'worker :: no valid data in fetched data, removing from Redis set %s - data - %s'
                        % (redis_set, str(str_metric_data)))
                    try:
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (redis_set, str(str_metric_data)))
                    continue

                if valid_data:
                    flux_host = 'http://%s:%s' % (settings.FLUX_IP,
                                                  settings.FLUX_PORT)

                    # Resample
                    resample_at = None
                    if resample_at == 'none' or resample_at == '0Min':
                        resample_at = False
                    if resample_at == 'None' or resample_at == '0min':
                        resample_at = False
                    if resample_at is None or resample_at == '0' or resample_at == 0:
                        resample_at = False
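                    # resample_at is hardcoded to None above, so after these
                    # checks it is False and the resample branch below is
                    # effectively disabled in this worker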
                    if resample_at:
                        try:
                            df = pd.DataFrame(timeseries)
                            df.columns = ['timestamp', 'value']
                            df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                             unit='s',
                                                             origin='unix')
                            df = df.set_index('timestamp')
                            resampled_df = df.resample(resample_at).sum()
                            resampled_timeseries = []
                            for index, row in resampled_df.iterrows():
                                timestamp = int(index.strftime('%s'))
                                resampled_timeseries.append(
                                    [timestamp, row[0]])
                            timeseries = resampled_timeseries
                            timeseries_length = len(timeseries)
                            logger.info(
                                'worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                                % (str(resample_at), str(timeseries_length)))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to resample time series at %s for %s with time series %s'
                                % (str(resample_at), str(metric),
                                   str(timeseries)))

                    for timestamp, value in timeseries:
                        flux_url = '%s/metric_data?metric=%s&value=%s&timestamp=%s&key=%s' % (
                            flux_host, metric, str(value), str(timestamp),
                            settings.FLUX_SELF_API_KEY)
                        success = False
                        response = None
                        try:
                            response = requests.get(flux_url)
                            if response.status_code == 200:
                                success = True
                            elif response.status_code == 204:
                                success = True
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to request %s' %
                                str(flux_url))
                        if not success and response is not None:
                            logger.error(
                                'error :: worker :: http status code - %s, reason - %s'
                                % (str(response.status_code),
                                   str(response.reason)))

                    if success:
                        metrics_sent_to_flux += 1
                        redis_set = 'vista.fetcher.metrics.json'

                        # @added 20191011 - Task #3258: Reduce vista logging
                        timeseries_length = len(timeseries)

                        # @modified 20191011 - Task #3258: Reduce vista logging
                        # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % (
                        #     redis_set))
                        logger.info(
                            'worker :: %s data points submitted to flux OK for %s'
                            % (str(timeseries_length), metric))
                        try:
                            self.redis_conn.srem(redis_set, str_metric_data)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to delete data from Redis set %s, data - %s'
                                % (redis_set, str(str_metric_data)))

                        redis_set = 'vista.fetcher.unique_metrics'
                        try:
                            self.redis_conn.sadd(redis_set, remote_target)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to add %s to Redis set %s'
                                % (remote_target, redis_set))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics sent_to_flux in last 60 seconds - %s' %
                    str(metrics_sent_to_flux))
                send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(parent_skyline_app, send_metric_name,
                                         str(metrics_sent_to_flux))
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_flux = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (send_metric_name, str(metrics_sent_to_flux)))
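
The resolution detection above infers a metric's sampling interval as the most
common difference between consecutive timestamps. A standalone sketch of that
technique; the function name and the 60 second default are illustrative and
not part of the worker:

from collections import Counter

def determine_resolution(timestamps, default_resolution=60):
    # Differences between consecutive timestamps
    deltas = [b - a for a, b in zip(timestamps, timestamps[1:])]
    if not deltas:
        return default_resolution
    # The most frequently occurring delta is taken as the resolution
    resolution, _count = Counter(deltas).most_common(1)[0]
    return resolution if resolution > 0 else default_resolution

# determine_resolution([1609459200, 1609459260, 1609459320]) returns 60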
Beispiel #21
0
def get_boundary_metrics(current_skyline_app,
                         metrics,
                         namespaces,
                         cluster_data=False,
                         log=False):
    """
    Determine all the boundary metrics and return a dictionary of them and
    their algorithms.

    :param current_skyline_app: the app calling the function
    :param metrics: a list of base_names
    :param namespaces: a list of namespace patterns to match
    :param cluster_data: whether this is a cluster_data request, optional,
        defaults to False
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type metrics: list
    :type namespaces: list
    :type cluster_data: boolean
    :type log: boolean
    :return: boundary_metrics
    :rtype: dict

    """

    boundary_metrics = {}
    function_str = 'get_boundary_metrics'

    filter_by_metrics = []

    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.info('%s :: %s :: determining boundary_metrics' %
                            (current_skyline_app, function_str))
    else:
        current_logger = None

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' %
            (current_skyline_app, function_str, e))
        raise

    boundary_metrics_redis_dict = {}
    try:
        boundary_metrics_redis_dict = redis_conn_decoded.hgetall(
            'metrics_manager.boundary_metrics')
        if log:
            current_logger.info(
                '%s :: %s :: got %s boundary metrics from metrics_manager.boundary_metrics'
                % (current_skyline_app, function_str,
                   str(len(boundary_metrics_redis_dict))))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key metrics_manager.boundary_metrics - %s'
            % (function_str, e))
        raise

    boundary_metrics = boundary_metrics_redis_dict.copy()

    remote_boundary_metrics = []
    if settings.REMOTE_SKYLINE_INSTANCES and cluster_data:
        boundary_metrics_uri = 'boundary_metrics'
        try:
            remote_boundary_metrics = get_cluster_data(boundary_metrics_uri,
                                                       'boundary_metrics')
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to get boundary_metrics from remote instances - %s'
                % (function_str, e))
            raise
        if remote_boundary_metrics:
            if log:
                current_logger.info(
                    'got %s remote boundary_metrics from the remote Skyline instances'
                    % str(len(remote_boundary_metrics)))
            for remote_data in remote_boundary_metrics:
                for base_name in list(remote_data.keys()):
                    boundary_metrics[base_name] = remote_data[base_name]

    if metrics:
        for metric in metrics:
            filter_by_metrics.append(metric)

    unique_base_names = []
    if namespaces:
        redis_key = 'analyzer.metrics_manager.db.metric_names'
        try:
            unique_base_names = list(redis_conn_decoded.smembers(redis_key))
            if unique_base_names:
                if log:
                    current_logger.info(
                        '%s :: %s :: got %s unique_base_names' %
                        (current_skyline_app, function_str,
                         str(len(unique_base_names))))
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: %s :: failed to get Redis key %s - %s' %
                (current_skyline_app, function_str, redis_key, e))
            raise
        for base_name in unique_base_names:
            try:
                pattern_match, metric_matched_by = matched_or_regexed_in_list(
                    current_skyline_app, base_name, namespaces)
                if pattern_match:
                    filter_by_metrics.append(base_name)
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(
                        current_skyline_app_logger)
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: %s :: %s :: matched_or_regexed_in_list failed for %s - %s' %
                    (current_skyline_app, function_str, base_name, e))

    if filter_by_metrics:
        if log:
            current_logger.info('%s :: %s :: filtering on %s metrics' %
                                (current_skyline_app, function_str,
                                 str(len(filter_by_metrics))))

        filtered_boundary_metrics = {}
        for base_name in list(set(filter_by_metrics)):
            boundary_metric_dict = None
            try:
                boundary_metric_dict = boundary_metrics_redis_dict[base_name]
            except:
                continue
            if boundary_metric_dict:
                filtered_boundary_metrics[base_name] = boundary_metric_dict
        if filtered_boundary_metrics:
            boundary_metrics = filtered_boundary_metrics.copy()
            if log:
                current_logger.info(
                    '%s :: %s :: filtered %s boundary_metrics' %
                    (current_skyline_app, function_str,
                     str(len(boundary_metrics))))

    return boundary_metrics
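
A minimal usage sketch for get_boundary_metrics above; the calling app name
and the namespace pattern are illustrative only:

boundary_metrics = get_boundary_metrics(
    'webapp', [], [r'carbon\..*'], cluster_data=False, log=False)
for base_name in boundary_metrics:
    # Each value is the boundary algorithm configuration held for the
    # base_name in the metrics_manager.boundary_metrics Redis hash
    boundary_config = boundary_metrics[base_name]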
Beispiel #22
0
    def run(self):
        """
        Called when the process initializes.
        """
        def pickle_data_to_graphite(data):

            message = None
            try:
                payload = pickle.dumps(data, protocol=2)
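                # Carbon's pickle listener expects each message framed as a
                # 4 byte big-endian length header followed by the pickled
                # payload, which is what struct.pack("!L", ...) builds below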
                header = struct.pack("!L", len(payload))
                message = header + payload
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle to send to Graphite'
                )
                return False
            if message:
                try:
                    sock = socket.socket()
                    sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT))
                    sock.sendall(message)
                    sock.close()
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to send pickle data to Graphite'
                    )
                    return False
            else:
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle metric data into message'
                )
                return False
            return True

        logger.info('populate_metric_worker :: starting worker')

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'populate_metric_worker :: cannot connect to Redis at socket path %s'
                        % (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    # @added 20191128 - Bug #3266: py3 Redis binary objects not strings
                    #                   Branch #3262: py3
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metricDict = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_json = [metricName, metricValue, metricTimestamp]
                metricDict = self.q.get(True, 1)
                logger.info('populate_metric_worker :: processing queue item')
            except Empty:
                logger.info(
                    'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds'
                )
                sleep(30)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info(
                    'populate_metric_worker :: server was interrupted - SystemExit'
                )
            except Exception as e:
                logger.error('error :: populate_metric_worker :: %s' %
                             (str(e)))

            if not metricDict:
                continue

            try:
                remote_host_type = str(metricDict['remote_host_type'])
                remote_target = str(metricDict['remote_target'])
                metric = str(metricDict['metric'])
                namespace_prefix = str(metricDict['namespace_prefix'])
                if not namespace_prefix:
                    namespace_prefix = ''
                if namespace_prefix == 'None':
                    namespace_prefix = ''
                key = str(metricDict['key'])
                token = str(metricDict['token'])
                user = str(metricDict['user'])
                password = str(metricDict['password'])
                if metricDict['fetch_resolution_urls'] == 'None':
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue
                if metricDict['fetch_resolution_urls'] == '()' or metricDict[
                        'fetch_resolution_urls'] == ():
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue

                fetch_resolution_urls_str = literal_eval(
                    metricDict['fetch_resolution_urls'])
                fetch_resolution_urls = literal_eval(fetch_resolution_urls_str)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to read from metricData'
                )
                continue

            if LOCAL_DEBUG:
                try:
                    logger.info(
                        'populate_metric_worker :: remote_target from metricData set to %s'
                        % remote_target)
                    logger.info(
                        'populate_metric_worker :: metric from metricData set to %s'
                        % metric)
                    logger.info(
                        'populate_metric_worker :: namespace_prefix from metricData set to %s'
                        % namespace_prefix)
                    logger.info(
                        'populate_metric_worker :: key from metricData set to %s'
                        % key)
                    logger.info(
                        'populate_metric_worker :: token from metricData set to %s'
                        % token)
                    logger.info(
                        'populate_metric_worker :: user from metricData set to %s'
                        % user)
                    logger.info(
                        'populate_metric_worker :: password from metricData set to %s'
                        % password)
                    logger.info(
                        'populate_metric_worker :: fetch_resolution_urls from metricData set to %s'
                        % str(fetch_resolution_urls))
                    if fetch_resolution_urls:
                        for fetch_url in fetch_resolution_urls:
                            logger.info(
                                'populate_metric_worker :: a fetch_url from metricData is set to %s'
                                % str(fetch_url))
                    logger.info(
                        'populate_metric_worker :: metric is set to %s' %
                        metric)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to read from metricData'
                    )

            # Best effort to de-duplicate the data sent to Graphite
            cache_key = 'flux.last.%s' % metric
            last_flux_timestamp = None
            try:
                # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8')
                redis_last_metric_data = self.redis_conn_decoded.get(cache_key)
                last_metric_data = literal_eval(redis_last_metric_data)
                last_flux_timestamp = int(last_metric_data[0])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s'
                    % cache_key)
                last_flux_timestamp = False
            recent_last_flux_timestamp_present = False
            if last_flux_timestamp:
                now = int(time())
                if (now - last_flux_timestamp) < 600:
                    recent_last_flux_timestamp_present = True
                    # Skyline has the metric so adding it to the vista.fetcher
                    # Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s'
                            % (metric, redis_set))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))
                    # continue

            if not last_flux_timestamp:
                # Check Graphite does not have the data or determine what the
                # last data Graphite has is
                logger.info(
                    'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data'
                    % (metric))
                check_graphite_from = [
                    '-50mins', '-6hours', '-24hours', '-7days', '-30days',
                    '-90days'
                ]
                timeseries = []
                for graphite_from in check_graphite_from:
                    if last_flux_timestamp:
                        break
                    logger.info(
                        'populate_metric_worker :: checking %s in Graphite from %s'
                        % (metric, graphite_from))
                    got_data = False
                    try:
                        # We use absolute time so that if there is a lag in mirage the correct
                        # timeseries data is still surfaced relevant to the anomalous datapoint
                        # timestamp
                        if settings.GRAPHITE_PORT != '':
                            url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                str(settings.GRAPHITE_PORT),
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        else:
                            url = '%s://%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        logger.info(
                            'populate_metric_worker :: using Graphite URL - %s'
                            % (url))
                        r = requests.get(url)
                        if r.status_code == 200:
                            js = []
                            try:
                                js = r.json()
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: populate_metric_worker :: failed to get data from Graphite'
                                )
                                continue
                            if not js:
                                logger.info(
                                    'populate_metric_worker :: %s not present in Graphite from %s'
                                    % (metric, graphite_from))
                                continue
                            got_data = True
                            logger.info(
                                'populate_metric_worker :: %s present in Graphite from %s'
                                % (metric, graphite_from))
                        else:
                            logger.info(
                                'populate_metric_worker :: %s not present in Graphite from %s'
                                % (metric, graphite_from))
                            continue
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to get data from Graphite'
                        )
                        continue

                    datapoints = []
                    if got_data:
                        try:
                            js = r.json()
                            datapoints = js[0]['datapoints']
                            logger.info(
                                'populate_metric_worker :: %s data points are present in the Graphite %s data'
                                % (str(len(datapoints)), str(graphite_from)))
                        except:
                            logger.info(traceback.format_exc())
                            logger.error(
                                'error :: populate_metric_worker :: failed to get data from Graphite'
                            )

                    for datapoint in datapoints:
                        try:
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                        except:  # nosec
                            continue
                    last_timestamp_with_data = None
                    for timestamp, value in timeseries[::-1]:
                        has_value = False
                        if value == 0.0:
                            has_value = True
                        if value == 0:
                            has_value = True
                        if value:
                            has_value = True
                        if has_value:
                            last_timestamp_with_data = int(timestamp)
                            datapoint = value
                            break
                    if last_timestamp_with_data:
                        # Here we set this as the missing last_flux_timestamp
                        last_flux_timestamp = last_timestamp_with_data
                        recent_last_flux_timestamp_present = True
                        logger.info(
                            'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp'
                            % (metric, str(graphite_from),
                               str(last_flux_timestamp)))

            timeseries = []
            start_populating = int(time())
            datapoints_added_to_timeseries = 0
            datapoints_already_populated = 0
            datapoints_with_no_value = 0
            timestamp = None
            value = None

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # And set flux.last key is the returned value from the remote is
            # null so that time series that are mostly null do not keep on
            # getting added to flux populate_metric by Vista
            raw_timeseries = []

            for fetch_url in fetch_resolution_urls:
                # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
                # This was for the query query and resample method and not for
                # the query_range query
                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000':
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )

                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
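                    # Rewrite the Prometheus range selector in the fetch URL,
                    # both the raw [Xm] form and the URL encoded %5BXm%5D form,
                    # so only the period since last_flux_timestamp plus a two
                    # minute buffer is fetched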
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )

                success = False
                try:
                    logger.info(
                        'populate_metric_worker :: getting data from %s' %
                        str(fetch_url))
                    response = requests.get(fetch_url)
                    if response.status_code == 200:
                        success = True
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))

                if not success:
                    continue

                datapoints = None
                try:
                    js = response.json()
                    if remote_host_type == 'graphite':
                        datapoints = js[0]['datapoints']
                    if remote_host_type == 'prometheus':
                        datapoints = js['data']['result'][0]['values']
                    datapoints_fetched = len(datapoints)
                    logger.info(
                        'populate_metric_worker :: retrieved %s data points from %s'
                        % (str(datapoints_fetched), str(fetch_url)))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))

                # Example
                # datapoints[0]
                # [7.3, 1556817000]
                # Add each data point and timestamp to the timeseries list so
                # they can be sent to Graphite
                if not datapoints:
                    logger.info(
                        'populate_metric_worker :: failed to get any data from %s'
                        % str(fetch_url))
                    continue

                # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                valid_datapoints = []
                for datapoint in datapoints:
                    value = None
                    timestamp = None
                    if remote_host_type == 'graphite':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[1], datapoint[0]])

                        try:
                            raw_value = datapoint[0]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            valid_datapoints.append([value, timestamp])
                        except:
                            continue
                    if remote_host_type == 'prometheus':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[0], datapoint[1]])

                        try:
                            raw_value = datapoint[1]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            timestamp = int(datapoint[0])
                            value = float(datapoint[1])
                        except:
                            continue
                        valid_datapoints.append([timestamp, value])
                datapoints = valid_datapoints

                # Order the time series by timestamp as the tuple order can
                # shift, resulting in more recent data being added before
                # older data. The valid_datapoints are [value, timestamp]
                # for graphite and [timestamp, value] for prometheus
                if remote_host_type == 'graphite':
                    datapoints.sort(key=lambda x: x[1])
                else:
                    datapoints.sort(key=lambda x: x[0])

                # Determine the timestamp of the current minute to apply
                # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                time_now = int(time())
                current_minute_hour = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%H'))
                current_minute_minute = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%M'))
                current_datetime = datetime.datetime.utcfromtimestamp(
                    time_now).replace(hour=current_minute_hour,
                                      minute=current_minute_minute,
                                      second=0,
                                      microsecond=0)
                current_minute_timestamp_start = int(
                    current_datetime.strftime('%s'))
                datapoints_in_current_minute = 0

                last_error = None
                value = None
                timestamp = None
                for datapoint in datapoints:
                    try:
                        if remote_host_type == 'graphite':
                            try:
                                raw_value = datapoint[0]
                                if raw_value is None:
                                    continue
                                value = float(datapoint[0])
                                timestamp = int(datapoint[1])
                            except:
                                continue
                        if remote_host_type == 'prometheus':
                            # timestamp = int(datapoint[0])
                            try:
                                timestamp = int(datapoint[0])
                                value = float(datapoint[1])
                            except:
                                continue
                        submit_data = True
                        if last_flux_timestamp:
                            if timestamp <= last_flux_timestamp:
                                submit_data = False
                                datapoints_already_populated += 1

                        # Here if the timestamp of the data point falls
                        # within the current minute, it is discarded and not
                        # sent to flux, to ensure that high frequency metrics
                        # can have their minutely bins fully populated before
                        # they are submitted to Graphite
                        if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                            if timestamp >= current_minute_timestamp_start:
                                submit_data = False
                                datapoints_in_current_minute += 1
                        if submit_data:
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                            datapoints_added_to_timeseries += 1
                    # nosec to exclude from bandit tests
                    except:  # nosec
                        last_error = traceback.format_exc()
                        datapoints_with_no_value += 1
                        continue

                if last_error:
                    logger.error(last_error)
                    logger.error(
                        'error :: populate_metric_worker :: the above is the last_error encountered processing %s'
                        % (str(metric)))
                if datapoints_with_no_value:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they had value None'
                        % (str(datapoints_with_no_value)))
                if datapoints_in_current_minute:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute'
                        % (str(datapoints_in_current_minute)))
                logger.info(
                    'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp'
                    % (str(datapoints_already_populated)))
                logger.info(
                    'populate_metric_worker :: added %s data points to the time series to submit to Graphite'
                    % (str(datapoints_added_to_timeseries)))

            end_fetching = int(time())
            seconds_to_fetch = end_fetching - start_populating
            if timestamp:
                logger.info(
                    'populate_metric_worker :: last fetched value - %s, timestamp %s'
                    % (str(value), str(timestamp)))
            logger.info(
                'populate_metric_worker :: %s data points fetched for %s in %s seconds'
                % (str(datapoints_added_to_timeseries), remote_target,
                   str(seconds_to_fetch)))

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # Also set the flux.last key if the returned value from the remote
            # is null so that time series that are mostly null do not keep
            # getting added to flux populate_metric by Vista
            if not timeseries:
                set_flux_key = False
                try:
                    sorted_raw_timeseries = sorted(raw_timeseries,
                                                   key=lambda x: x[0])
                    last_ts = sorted_raw_timeseries[-1][0]
                    if int(last_ts) > (end_fetching - 120):
                        if sorted_raw_timeseries[-1][1] is None:
                            set_flux_key = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to determine if last value was null'
                    )
                if set_flux_key:
                    try:
                        # Update Redis flux key
                        cache_key = 'flux.last.%s' % metric
                        metric_data = [int(last_ts), None]
                        self.redis_conn.set(cache_key, str(metric_data))
                        logger.info(
                            'populate_metric_worker :: even though there were no data points, set the metric Redis key - %s - %s so as to not loop round on this metric'
                            % (cache_key, str(metric_data)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s'
                            % (cache_key, str(metric_data)))
                    # Adding to the vista.fetcher.unique_metrics Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: even though no data points, added %s to Redis set %s'
                            % (remote_target, redis_set))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))

            if not timeseries:
                logger.info(
                    'populate_metric_worker :: no data in the timeseries list for %s'
                    % metric)
                continue

            # Order the time series by timestamp as the tuple can shift
            # order resulting in more recent data being added before older
            # data
            timeseries.sort()
            timeseries_length = len(timeseries)

            # Resample
            resample_at = '1Min'
            if resample_at:
                try:
                    df = pd.DataFrame(timeseries)
                    df.columns = ['timestamp', 'value']
                    df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                     unit='s',
                                                     origin='unix')
                    df = df.set_index('timestamp')
                    # resampled_df = df.resample(resample_at).sum()
                    # Use the mean as Prometheus uses the average in the
                    # query_range API method
                    resampled_df = df.resample(resample_at).mean()
                    resampled_timeseries = []
                    for index, row in resampled_df.iterrows():
                        timestamp = int(index.strftime('%s'))
                        resampled_timeseries.append([timestamp, row[0]])
                    timeseries = resampled_timeseries
                    timeseries_length = len(timeseries)
                    logger.info(
                        'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                        % (str(resample_at), str(timeseries_length)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to resample time series for %s'
                        % str(metric))

            logger.info(
                'populate_metric_worker :: %s data points to send to Graphite'
                % (str(timeseries_length)))
            timestamp = None
            value = None
            sent_to_graphite = 0

            # use_pickle = False
            use_pickle = True
            if not use_pickle:
                for timestamp, value in timeseries:
                    try:
                        graphyte.send(metric, float(value), int(timestamp))
                        sent_to_graphite += 1
                        if sent_to_graphite % 1000 == 0:
                            logger.info(
                                'populate_metric_worker :: submitted %s of %s data points to Graphite so far'
                                % (str(sent_to_graphite),
                                   str(timeseries_length)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                            % str(metric))
            else:
                listOfMetricTuples = []
                try:
                    for timestamp, value in timeseries:
                        tuple_data = (metric, (int(timestamp), float(value)))
                        listOfMetricTuples.append(tuple_data)
                        sent_to_graphite += 1
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s'
                        % str(metric))
                if listOfMetricTuples:
                    data_points_sent = 0
                    smallListOfMetricTuples = []
                    tuples_added = 0
                    for data in listOfMetricTuples:
                        smallListOfMetricTuples.append(data)
                        tuples_added += 1
                        if tuples_added >= 1000:
                            pickle_data_sent = pickle_data_to_graphite(
                                smallListOfMetricTuples)
                            if pickle_data_sent:
                                data_points_sent += tuples_added
                                logger.info(
                                    'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s'
                                    %
                                    (str(tuples_added), str(data_points_sent),
                                     str(timeseries_length), metric))
                                sent_to_graphite += len(
                                    smallListOfMetricTuples)
                                smallListOfMetricTuples = []
                                tuples_added = 0
                            else:
                                logger.error(
                                    'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s'
                                    % (str(tuples_added), metric))
                    if smallListOfMetricTuples:
                        tuples_to_send = len(smallListOfMetricTuples)
                        pickle_data_sent = pickle_data_to_graphite(
                            smallListOfMetricTuples)
                        if pickle_data_sent:
                            data_points_sent += tuples_to_send
                            logger.info(
                                'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), str(data_points_sent),
                                   str(timeseries_length), metric))
                        else:
                            logger.error(
                                'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), metric))

            logger.info(
                'populate_metric_worker :: sent %s data points to Graphite for %s'
                % (str(sent_to_graphite), metric))
            try:
                skyline_metric = '%s.datapoints_sent_to_graphite' % (
                    skyline_app_graphite_namespace)
                # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                # graphyte.send(skyline_metric, float(sent_to_graphite), int(time()))
                send_graphite_metric(skyline_app, skyline_metric,
                                     float(sent_to_graphite))
                logger.info(
                    'populate_metric_worker :: submitted %s to Graphite for %s'
                    % (str(float(sent_to_graphite)), skyline_metric))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                    % str(skyline_metric))

            # A value of 0 or 0.0 is a valid data point, so test for zero
            # explicitly rather than relying on truthiness alone
            has_value = False
            if value == 0.0:
                has_value = True
            if value == 0:
                has_value = True
            if value:
                has_value = True

            if timestamp and has_value:
                try:
                    # Update Redis flux key
                    cache_key = 'flux.last.%s' % metric
                    metric_data = [int(timestamp), float(value)]
                    self.redis_conn.set(cache_key, str(metric_data))
                    logger.info(
                        'populate_metric_worker :: set the metric Redis key - %s - %s'
                        % (cache_key, str(metric_data)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to set Redis key - %s - %s'
                        % (cache_key, str(metric_data)))

                # Adding to the vista.fetcher.unique_metrics Redis set
                redis_set = 'vista.fetcher.unique_metrics'
                data = str(remote_target)
                try:
                    self.redis_conn.sadd(redis_set, data)
                    logger.info(
                        'populate_metric_worker :: added %s to Redis set %s' %
                        (remote_target, redis_set))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                        % (str(data), str(redis_set)))

            end_populating = int(time())
            seconds_to_run = end_populating - start_populating
            logger.info(
                'populate_metric_worker :: %s populated to Graphite in %s seconds'
                % (metric, str(seconds_to_run)))
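
The 1Min resample above reduces higher frequency remote data to minutely bins before it is submitted to Graphite. A minimal standalone sketch of the same pandas pattern follows; the sample timestamps and values are illustrative and the epoch conversion via Timestamp.timestamp() is an assumption, not code taken from the Skyline source:

import pandas as pd

# Illustrative [timestamp, value] pairs at 15 second resolution
timeseries = [
    [1609459200, 1.0], [1609459215, 2.0], [1609459230, 3.0], [1609459245, 4.0],
    [1609459260, 10.0], [1609459275, 20.0],
]
df = pd.DataFrame(timeseries, columns=['timestamp', 'value'])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
df = df.set_index('timestamp')
# Use the mean, mirroring the Prometheus query_range averaging behaviour
resampled_df = df.resample('1Min').mean()
resampled_timeseries = [
    [int(index.timestamp()), row['value']]
    for index, row in resampled_df.iterrows()]
print(resampled_timeseries)  # [[1609459200, 2.5], [1609459260, 15.0]]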
Example #23
0
def alert_smtp(datapoint, metric_name, expiration_time, metric_trigger,
               algorithm):

    sender = settings.BOUNDARY_SMTP_OPTS['sender']

    matched_namespaces = []
    for namespace in settings.BOUNDARY_SMTP_OPTS['recipients']:
        CHECK_MATCH_PATTERN = namespace
        check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
        pattern_match = check_match_pattern.match(metric_name)
        if pattern_match:
            matched_namespaces.append(namespace)
    matched_recipients = []
    for namespace in matched_namespaces:
        for recipients in settings.BOUNDARY_SMTP_OPTS['recipients'][namespace]:
            matched_recipients.append(recipients)

    def unique_noHash(seq):
        seen = set()
        return [x for x in seq if str(x) not in seen and not seen.add(str(x))]

    recipients = unique_noHash(matched_recipients)

    # Backwards compatibility
    if type(recipients) is str:
        recipients = [recipients]

    # @added 20180524 - Task #2384: Change alerters to cc other recipients
    # The alerters used to send an individual email to each recipient. It is
    # more useful to send one email with the first smtp recipient as the to
    # recipient and the subsequent recipients added as cc.
    primary_recipient = False
    cc_recipients = False
    if recipients:
        for i_recipient in recipients:
            if not primary_recipient:
                primary_recipient = str(i_recipient)
            if primary_recipient != i_recipient:
                if not cc_recipients:
                    cc_recipients = str(i_recipient)
                else:
                    new_cc_recipients = '%s,%s' % (str(cc_recipients),
                                                   str(i_recipient))
                    cc_recipients = str(new_cc_recipients)
        logger.info(
            'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s'
            % (str(primary_recipient), str(cc_recipients)))

    alert_algo = str(algorithm)
    alert_context = alert_algo.upper()

    # @added 20191008 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
    try:
        main_alert_title = settings.CUSTOM_ALERT_OPTS['main_alert_title']
    except:
        main_alert_title = 'Skyline'
    try:
        app_alert_context = settings.CUSTOM_ALERT_OPTS[
            'boundary_alert_heading']
    except:
        app_alert_context = 'Boundary'

    # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
    # Use alert_context
    # unencoded_graph_title = 'Skyline Boundary - %s at %s hours - %s - %s' % (
    #     alert_context, graphite_previous_hours, metric_name, datapoint)
    unencoded_graph_title = '%s %s - %s at %s hours - %s - %s' % (
        main_alert_title, app_alert_context, alert_context,
        graphite_previous_hours, metric_name, datapoint)

    # @added 20181126 - Task #2742: Update Boundary
    #                   Feature #2034: analyse_derivatives
    # Added derivative functions to convert the values of strictly increasing
    # monotonic metrics to their derivative products in alert graphs and to
    # note this in the graph_title
    known_derivative_metric = False
    try:
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # Use get_redis_conn_decoded
        # if settings.REDIS_PASSWORD:
        #     # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings
        #     #                      Branch #3262: py3
        #     # REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        #     REDIS_ALERTER_CONN = redis.StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True)
        # else:
        #     # REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        #     REDIS_ALERTER_CONN = redis.StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True)
        REDIS_ALERTER_CONN = get_redis_conn_decoded(skyline_app)
    except:
        logger.error('error :: alert_smtp - redis connection failed')

    # @modified 20191022 - Bug #3266: py3 Redis binary objects not strings
    #                      Branch #3262: py3
    try:
        derivative_metrics = list(
            REDIS_ALERTER_CONN.smembers('derivative_metrics'))
    except:
        derivative_metrics = []
    redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, str(metric_name))
    if redis_metric_name in derivative_metrics:
        known_derivative_metric = True
    if known_derivative_metric:
        try:
            non_derivative_monotonic_metrics = settings.NON_DERIVATIVE_MONOTONIC_METRICS
        except:
            non_derivative_monotonic_metrics = []
        skip_derivative = in_list(redis_metric_name,
                                  non_derivative_monotonic_metrics)
        if skip_derivative:
            known_derivative_metric = False

    # is_derivative_metric determines the final known_derivative_metric value
    # and supersedes the manual derivative_metrics lookup above
    known_derivative_metric = is_derivative_metric(skyline_app, metric_name)

    if known_derivative_metric:
        # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
        # unencoded_graph_title = 'Skyline Boundary - %s at %s hours - derivative graph - %s - %s' % (
        #     alert_context, graphite_previous_hours, metric_name, datapoint)
        unencoded_graph_title = '%s %s - %s at %s hours - derivative graph - %s - %s' % (
            main_alert_title, app_alert_context, alert_context,
            graphite_previous_hours, metric_name, datapoint)

    graph_title_string = quote(unencoded_graph_title, safe='')
    graph_title = '&title=%s' % graph_title_string

    # @added 20181126 - Bug #2498: Incorrect scale in some graphs
    #                   Task #2742: Update Boundary
    # If -xhours is used the scale is incorrect if x hours > than first
    # retention period, passing from and until renders the graph with the
    # correct scale.
    graphite_port = '80'
    if settings.GRAPHITE_PORT != '':
        graphite_port = str(settings.GRAPHITE_PORT)
    until_timestamp = int(time())
    from_seconds_ago = graphite_previous_hours * 3600
    from_timestamp = until_timestamp - from_seconds_ago
    graphite_from = dt.datetime.fromtimestamp(
        int(from_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_from - %s' % str(graphite_from))
    graphite_until = dt.datetime.fromtimestamp(
        int(until_timestamp)).strftime('%H:%M_%Y%m%d')
    logger.info('graphite_until - %s' % str(graphite_until))
    # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
    # graphite_target = 'target=cactiStyle(%s)'
    graphite_target = 'target=cactiStyle(%s,%%27si%%27)' % metric_name
    if known_derivative_metric:
        # @modified 20191022 - Task #3294: py3 - handle system parameter in Graphite cactiStyle
        # graphite_target = 'target=cactiStyle(nonNegativeDerivative(%s))'
        graphite_target = 'target=cactiStyle(nonNegativeDerivative(%s),%%27si%%27)' % metric_name
    # @modified 20190520 - Branch #3002: docker
    # Use GRAPHITE_RENDER_URI
    # link = '%s://%s:%s/render/?from=%s&until=%s&%s%s%s&colorList=%s' % (
    #     settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, graphite_port,
    #     str(graphite_from), str(graphite_until), graphite_target,
    #     settings.GRAPHITE_GRAPH_SETTINGS, graph_title,
    #     graphite_graph_line_color)
    link = '%s://%s:%s/%s/?from=%s&until=%s&%s%s%s&colorList=%s' % (
        settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST,
        graphite_port, settings.GRAPHITE_RENDER_URI, str(graphite_from),
        str(graphite_until), graphite_target, settings.GRAPHITE_GRAPH_SETTINGS,
        graph_title, graphite_graph_line_color)

    content_id = metric_name
    image_data = None

    image_file = '%s/%s.%s.%s.alert_smtp.png' % (
        settings.SKYLINE_TMP_DIR, skyline_app, str(until_timestamp),
        metric_name)
    if settings.BOUNDARY_SMTP_OPTS.get('embed-images'):
        image_data = get_graphite_graph_image(skyline_app, link, image_file)

    if settings.BOUNDARY_SMTP_OPTS.get('embed-images_disabled3290'):
        # @modified 20191021 - Task #3290: Handle urllib2 in py3
        #                      Branch #3262: py3
        if python_version == 2:
            try:
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                # image_data = urllib2.urlopen(link).read()  # nosec
                image_data = None
            except urllib2.URLError:
                image_data = None
        if python_version == 3:
            try:
                # image_data = urllib.request.urlopen(link).read()  # nosec
                image_data = None
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: boundary_alerters :: alert_smtp :: failed to urlopen %s'
                    % str(link))
                image_data = None

    # If we failed to get the image or if it was explicitly disabled,
    # use the image URL instead of the content.
    if image_data is None:
        img_tag = '<img src="%s"/>' % link
    else:
        img_tag = '<img src="cid:%s"/>' % content_id

    # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
    # body = '%s :: %s <br> Next alert in: %s seconds <br> skyline Boundary alert - %s <br><a href="%s">%s</a>' % (
    #     datapoint, metric_name, expiration_time, alert_context, link, img_tag)
    # Arguments ordered to match the format string - datapoint :: metric_name,
    # then the expiration, alert title, context and link
    body = '%s :: %s <br> Next alert in: %s seconds <br> %s %s alert - %s <br><a href="%s">%s</a>' % (
        datapoint, metric_name, expiration_time, main_alert_title,
        app_alert_context, alert_context, link, img_tag)

    # @modified 20180524 - Task #2384: Change alerters to cc other recipients
    # Do not send to each recipient, send to primary_recipient and cc the other
    # recipients, thereby sending only one email
    # for recipient in recipients:
    if primary_recipient:
        logger.info(
            'alert_smtp - will send to primary_recipient :: %s, cc_recipients :: %s'
            % (str(primary_recipient), str(cc_recipients)))

        msg = MIMEMultipart('alternative')
        # @modified 20191002 - Feature #3194: Add CUSTOM_ALERT_OPTS to settings
        # msg['Subject'] = '[Skyline alert] ' + 'Boundary ALERT - ' + alert_context + ' - ' + datapoint + ' - ' + metric_name
        msg['Subject'] = '[' + main_alert_title + ' alert] ' + app_alert_context + ' ALERT - ' + alert_context + ' - ' + datapoint + ' - ' + metric_name
        msg['From'] = sender
        # @modified 20180524 - Task #2384: Change alerters to cc other recipients
        # msg['To'] = recipient
        msg['To'] = primary_recipient

        # @added 20180524 - Task #2384: Change alerters to cc other recipients
        # Added Cc
        if cc_recipients:
            msg['Cc'] = cc_recipients

        msg.attach(MIMEText(body, 'html'))
        if image_data is not None:

            # msg_attachment = MIMEImage(image_data)
            fp = open(image_file, 'rb')
            msg_attachment = MIMEImage(fp.read())
            fp.close()

            msg_attachment.add_header('Content-ID', '<%s>' % content_id)
            msg.attach(msg_attachment)

        s = SMTP('127.0.0.1')
        # @modified 20180524 - Task #2384: Change alerters to cc other recipients
        # Send to primary_recipient and cc_recipients
        # s.sendmail(sender, recipient, msg.as_string())
        try:
            if cc_recipients:
                # sendmail expects a list of individual addresses, so split
                # the comma separated cc_recipients string before sending
                s.sendmail(
                    sender, [primary_recipient] + cc_recipients.split(','),
                    msg.as_string())
            else:
                s.sendmail(sender, primary_recipient, msg.as_string())
        except:
            logger.error(traceback.format_exc())
            logger.error(
                'error :: alert_smtp - could not send email to primary_recipient :: %s, cc_recipients :: %s'
                % (str(primary_recipient), str(cc_recipients)))
        s.quit()
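
A minimal sketch of the primary/cc recipient handling used above, with hypothetical addresses. smtplib's sendmail expects a flat list of individual addresses in to_addrs, so the comma separated cc_recipients string is split back into a list before sending:

# Hypothetical recipients resolved from the matched namespaces
recipients = ['first@example.org', 'second@example.org', 'third@example.org']

primary_recipient = False
cc_recipients = False
for i_recipient in recipients:
    if not primary_recipient:
        primary_recipient = str(i_recipient)
    if primary_recipient != i_recipient:
        if not cc_recipients:
            cc_recipients = str(i_recipient)
        else:
            cc_recipients = '%s,%s' % (cc_recipients, str(i_recipient))

to_addrs = [primary_recipient]
if cc_recipients:
    to_addrs += cc_recipients.split(',')
# s.sendmail(sender, to_addrs, msg.as_string()) would then deliver to all
print(primary_recipient)  # first@example.org
print(cc_recipients)      # second@example.org,third@example.org
print(to_addrs)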
Example #24
0
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up

        - Spawn a rolling process to do checks

        - Wait for the process to finish.

        - run_every 60 seconds
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('thunder/rolling :: starting %s/rolling' % skyline_app)

        try:
            SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME
            if SERVER_METRIC_PATH == '.':
                SERVER_METRIC_PATH = ''
        except Exception as e:
            SERVER_METRIC_PATH = ''
            logger.warning(
                'warning :: thunder/rolling :: settings.SERVER_METRICS_NAME is not declared in settings.py, defaults to \'\' - %s'
                % e)

        run_every = 60

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling cannot connect to redis at socket path %s - %s'
                    % (settings.REDIS_SOCKET_PATH, e))
                sleep(10)
                try:
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: thunder/rolling cannot connect to get_redis_conn - %s'
                        % e)
                continue

            # Report app up
            try:
                self.redis_conn.setex('thunder.rolling', 120, now)
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling :: could not update the Redis thunder.rolling key - %s'
                    % e)

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            try:
                p = Process(target=self.rolling_process, args=(0, ))
                pids.append(p)
                pid_count += 1
                logger.info('thunder/rolling :: starting rolling_process')
                p.start()
                spawned_pids.append(p.pid)
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling :: failed to spawn process - %s'
                    % e)

            # Self monitor the spawned processes and terminate any
            # rolling_process that has run for longer than run_every seconds
            p_starts = time()
            while time() - p_starts <= run_every:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        'thunder/rolling :: rolling_process completed in %.2f seconds'
                        % (time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info(
                    'thunder/rolling :: timed out, killing rolling_process process'
                )
                for p in pids:
                    logger.info(
                        'thunder/rolling :: killing rolling_process process')
                    p.terminate()
                    logger.info(
                        'thunder/rolling :: killed rolling_process process')

            for p in pids:
                if p.is_alive():
                    try:
                        logger.info(
                            'thunder/rolling :: stopping rolling_process - %s'
                            % (str(p.is_alive())))
                        p.terminate()
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: thunder/rolling :: failed to stop rolling_process - %s'
                            % e)

            process_runtime = time() - now
            if process_runtime < run_every:
                sleep_for = (run_every - process_runtime)

                process_runtime_now = time() - now
                sleep_for = (run_every - process_runtime_now)

                logger.info(
                    'thunder/rolling :: sleeping for %.2f seconds due to low run time...'
                    % sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except Exception as e:
                    logger.error(
                        'error :: thunder/rolling :: failed to del sleep_for - %s'
                        % e)
            try:
                del process_runtime
            except Exception as e:
                logger.error(
                    'error :: thunder/rolling :: failed to del process_runtime - %s'
                    % e)
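
A minimal standalone sketch of the spawn, monitor and terminate pattern used in the run() method above; the rolling_process stand-in and the run_every value are illustrative assumptions:

from multiprocessing import Process
from time import sleep, time


def rolling_process(i):
    # Stand-in for the real rolling_process, simulates two seconds of work
    sleep(2)


if __name__ == '__main__':
    run_every = 5
    p = Process(target=rolling_process, args=(0,))
    p.start()
    p_starts = time()
    while time() - p_starts <= run_every:
        if p.is_alive():
            # Just to avoid hogging the CPU
            sleep(.1)
        else:
            print('rolling_process completed in %.2f seconds' % (time() - p_starts))
            break
    else:
        # Only reached if the while condition expired without a break
        print('timed out, terminating rolling_process')
        p.terminate()
    p.join()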
Example #25
0
def prune_metrics_timestamp_hash_key(current_skyline_app,
                                     hash_key,
                                     older_than_timestamp,
                                     log=True):
    """
    Remove any entries from a metrics timestamp hash key older than the
    timestamp passed.

    :param current_skyline_app: the app calling the function
    :param hash_key: the metric:timestamp style Redis hash key
    :param older_than_timestamp: the unix timestamp
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type hash_key: str
    :type older_than_timestamp: int
    :type log: boolean
    :return: removed_count
    :rtype: int

    """

    removed_from_hash = 0
    function_str = 'metrics_manager :: functions.redis.prune_metrics_timestamp_hash_key'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    if log:
        current_logger.info(
            '%s :: pruning entries older than %s from Redis hash key %s' %
            (function_str, str(older_than_timestamp), hash_key))

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis connection - %s' %
            (function_str, e))
        return removed_from_hash

    metrics_dict = {}
    try:
        metrics_dict = redis_conn_decoded.hgetall(hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key %s - %s' %
            (function_str, hash_key, e))

    if not metrics_dict:
        return removed_from_hash

    # Remove entries older_than_timestamp
    metrics = list(metrics_dict.keys())
    for metric in metrics:
        try:
            timestamp = float(metrics_dict[metric])
            if int(timestamp) < older_than_timestamp:
                try:
                    redis_conn_decoded.hdel(hash_key, metric)
                    removed_from_hash += 1
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to del %s from Redis hash key %s - %s'
                        % (function_str, metric, hash_key, e))
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to manage %s from Redis hash key %s, breaking out of loop - %s'
                % (function_str, metric, hash_key, e))
            break

    if log:
        current_logger.info(
            '%s :: removed %s old entries from Redis hash key %s' %
            (function_str, str(removed_from_hash), hash_key))

    return removed_from_hash
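
A minimal usage sketch for the function above; the hash key name and the 24 hour window are illustrative assumptions, not an established Skyline key:

from time import time

# Remove entries older than 24 hours from a hypothetical metric:timestamp hash
older_than_timestamp = int(time()) - 86400
removed = prune_metrics_timestamp_hash_key(
    'analyzer', 'analyzer.metrics.last_timestamps', older_than_timestamp,
    log=True)
print('%s entries removed' % str(removed))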
Example #26
0
def get_top_level_namespaces(current_skyline_app, log=False):
    """
    Determine all top level parent namespaces and return the list.

    :param current_skyline_app: the app calling the function
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type log: boolean
    :return: top_level_namespaces
    :rtype: list

    """

    top_level_namespaces = []
    function_str = 'functions.metrics.get_top_level_namespaces'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.info('%s :: %s :: determining top level namespaces' %
                            (current_skyline_app, function_str))
    else:
        current_logger = None

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' %
            (current_skyline_app, function_str, e))
        return top_level_namespaces

    unique_base_names = []
    redis_key = 'aet.analyzer.unique_base_names'
    try:
        unique_base_names = list(redis_conn_decoded.smembers(redis_key))
        if unique_base_names:
            if log:
                current_logger.info('%s :: %s :: got %s unique_base_names' %
                                    (current_skyline_app, function_str,
                                     str(len(unique_base_names))))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: failed to get Redis key %s - %s' %
            (current_skyline_app, function_str, redis_key, e))
    for base_name in unique_base_names:
        top_level_namespace = base_name.split('.')[0]
        if top_level_namespace:
            top_level_namespaces.append(top_level_namespace)
    if top_level_namespaces:
        top_level_namespaces = list(set(top_level_namespaces))
        if log:
            current_logger.info(
                '%s :: %s :: returning %s top level namespaces' %
                (current_skyline_app, function_str,
                 str(len(top_level_namespaces))))

    return top_level_namespaces
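
A minimal usage sketch for the function above. Assuming base_names such as carbon.agents.host-a.cpuUsage and stats.web-01.requests exist in the aet.analyzer.unique_base_names Redis set, the returned list would contain the top level namespaces carbon and stats (order is not guaranteed as a set is used for deduplication):

top_level_namespaces = get_top_level_namespaces('webapp', log=False)
for namespace in sorted(top_level_namespaces):
    print(namespace)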
Example #27
0
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Use get_redis_conn and get_redis_conn_decoded
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            if LOCAL_DEBUG:
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: debug :: flux.httpMetricDataQueue queue size - %s'
                        % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )

            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp]
                metric_data = self.q.get(True, 1)

            except Empty:
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            # @added 20200206 - Feature #3444: Allow flux to backfill
            # Added backfill
            backfill = False

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: debug :: queue item found - %s' %
                            str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s'
                        % str(metric_data))
                    continue

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check flux.last key if this is not backfill
                    if not backfill:
                        cache_key = 'flux.last.%s' % metric
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            #                      Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(
                                cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(
                                    redis_last_metric_data)
                                last_metric_timestamp = int(
                                    last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: last_metric_timestamp for %s from %s is %s'
                                        % (metric, str(cache_key),
                                           str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                    % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s'
                                        % (str(timestamp),
                                           str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        try:
                            graphyte.send(metric, value, timestamp)
                            submittedToGraphite = True
                            logger.info(
                                'worker :: sent %s, %s, %s to Graphite' %
                                (str(metric), str(value), str(timestamp)))
                            metrics_sent_to_graphite += 1
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to send metric data to Graphite for %s'
                                % str(metric))
                            metric = None
                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            if not backfill:
                                metric_data = [timestamp, value]
                                self.redis_conn.set(cache_key,
                                                    str(metric_data))
                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                # Add a flux.filled key to Redis with a expiry
                                # set to FULL_DURATION so that Analyzer knows to
                                # sort and deduplicate the Redis time series
                                # data as carbon-relay will send it to Horizon
                                # and the datapoints will be out of order in the
                                # Redis key
                                try:
                                    flux_filled_key = 'flux.filled.%s' % str(
                                        metric)
                                    self.redis_conn.setex(
                                        flux_filled_key,
                                        settings.FULL_DURATION, int(time()))
                                    logger.info('worker :: set Redis key %s' %
                                                (str(flux_filled_key)))
                                except Exception as e:
                                    logger.error(
                                        'error :: worker :: failed to set Redis flux.filled key: %s'
                                        % e)
                    else:
                        logger.info(
                            'worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite'
                            % (str(metric), str(value), str(timestamp),
                               str(timestamp)))
                else:
                    logger.info(
                        'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s'
                        % (str(settings.FLUX_SEND_TO_CARBON), str(metric),
                           str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    logger.info('worker :: sent %s, %s, %s to statsd' %
                                (metric, str(value), str(timestamp)))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                    % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metrics_sent_to_graphite)))
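
A minimal sketch of the flux.last de-duplication check performed by the worker above, with a plain dict standing in for Redis; the key contents shown are illustrative:

from ast import literal_eval

# Stand-in for Redis, keyed like the flux.last.<metric> keys above
redis_store = {'flux.last.test.metric': '[1609459200, 1.0]'}

metric = 'test.metric'
timestamp = 1609459200
cache_key = 'flux.last.%s' % metric

valid_data = True
redis_last_metric_data = redis_store.get(cache_key)
if redis_last_metric_data:
    last_metric_data = literal_eval(redis_last_metric_data)
    last_metric_timestamp = int(last_metric_data[0])
    if timestamp <= last_metric_timestamp:
        # The queued timestamp is not newer than the last submitted one,
        # so the data point is discarded
        valid_data = False
print(valid_data)  # False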
Example #28
0
def get_metrics_timeseries(
        current_skyline_app, metrics_functions, from_timestamp,
        until_timestamp, log=True):
    """
    Return dictionary of metrics with their timeseries as a list e.g.

    metrics_timeseries = {
        'metric.1': {
            'timeseries': [[ts, value], [ts, value], ..., [ts, value]],
            'functions': 'nonNegativeDerivative',
        },
        'metric.2': {
            'timeseries': [[ts, value], [ts, value], ..., [ts, value]],
            'functions': None,
        },
        'metric.3': {
            'timeseries': [[ts, value], [ts, value], ..., [ts, value]],
            'functions': {'summarize': {'intervalString': '10min', 'func': 'sum'}, 'integral': None},
        },
    }

    The metrics_functions parameter is a dictionary that specifies the metrics
    and any functions to be applied to them e.g.

    metrics_functions = {
        'metric.1': {
            'functions': None,
        },
        'metric.2': {
            'functions': None,
        },
        'metric.3': {
            'functions': {'integral': None, 'summarize': {'intervalString': '10min', 'func': 'sum'}},
        },
    }

    Each metric can have one or multiple functions passed for it using the
    functions key in the dictionary item.  There is NO NEED to ever pass
    nonNegativeDerivative as this function uses the normal derivative_metrics
    information to apply that automatically.
    Functions are applied in the order in which they are passed e.g.

    target=integral(summarize(metric.3,"10min"))

    Function parameters can be passed with the function or declared as None if
    the function requires no parameters.

    :param current_skyline_app: the app calling the function
    :param metrics_functions: the metric base_names and any functions to apply
    :param from_timestamp: the from unix timestamp
    :param until_timestamp: the until unix timestamp
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type metrics_functions: dict
    :type log: boolean
    :return: dictionary of metric timeseries
    :rtype: dict

    """

    metrics_timeseries = {}

    function_str = '%s :: functions.graphite.get_metrics_timeseries' % current_skyline_app
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    # graphite URL
    graphite_port = '80'
    if settings.GRAPHITE_PORT != '':
        graphite_port = str(settings.GRAPHITE_PORT)
    if settings.GRAPHITE_PORT == '443' and settings.GRAPHITE_PROTOCOL == 'https':
        graphite_port = ''
    graphite_url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + graphite_port + '/' + settings.GRAPHITE_RENDER_URI + '?from=' + str(from_timestamp) + '&until=' + str(until_timestamp) + '&format=json'
    connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
    read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
    read_timeout = 30
    use_timeout = (int(connect_timeout), int(read_timeout))

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: %s :: failed to connect to Redis - %s' % (
            function_str, e))

    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(redis_conn_decoded.smembers('aet.metrics_manager.derivative_metrics'))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: %s :: failed to connect to Redis for smembers of derivative_metrics - %s' % (
            function_str, e))
        derivative_metrics = []

    # Add nonNegativeDerivative transform to derivative_metrics and then fetch
    # from Graphite in batches of MAX_GRAPHITE_TARGETS
    get_metrics_with_functions = {}
    for base_name in list(metrics_functions.keys()):
        redis_metric_name = '%s%s' % (settings.FULL_NAMESPACE, base_name)
        if redis_metric_name in derivative_metrics:
            get_metrics_with_functions[base_name] = metrics_functions[base_name]
            original_functions = metrics_functions[base_name]['functions']
            if original_functions is not None:
                functions = {}
                functions['nonNegativeDerivative'] = None
                for function in list(original_functions.keys()):
                    functions[function] = original_functions[function]
            else:
                functions = {'nonNegativeDerivative': None}
            get_metrics_with_functions[base_name]['functions'] = functions
        else:
            get_metrics_with_functions[base_name] = metrics_functions[base_name]

    metrics_list = list(get_metrics_with_functions.keys())

    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    if metrics_list:
        metrics_to_get = []
        while len(metrics_list) > 0:
            # Take up to MAX_GRAPHITE_TARGETS metrics per batch.  Popping
            # items from metrics_list while enumerating it skips entries, so
            # slice each batch off the front of the list instead.
            metrics_to_get = metrics_list[:MAX_GRAPHITE_TARGETS]
            metrics_list = metrics_list[MAX_GRAPHITE_TARGETS:]
            post_content = 'format=json&from=%s&until=%s' % (
                str(from_timestamp), str(until_timestamp))
            for base_name in metrics_to_get:
                functions = get_metrics_with_functions[base_name]['functions']
                target = base_name
                if functions is not None:
                    for function in list(functions.keys()):
                        function_arguments = functions[function]
                        if function_arguments is None:
                            target = '%s(%s)' % (function, target)
                        if isinstance(function_arguments, int):
                            target = '%s(%s,%s)' % (function, target, function_arguments)
                        if isinstance(function_arguments, str):
                            target = '%s(%s,"%s")' % (function, target, function_arguments)
                        if isinstance(function_arguments, dict):
                            target = '%s(%s' % (function, target)
                            for function_parmeter in list(function_arguments.keys()):
                                function_parmeter_value = function_arguments[function_parmeter]
                                if function_parmeter_value is None:
                                    target = str(target)
                                if isinstance(function_parmeter_value, int):
                                    target = '%s,%s' % (target, function_parmeter_value)
                                if isinstance(function_parmeter_value, str):
                                    target = '%s,"%s"' % (target, function_parmeter_value)
                            target = '%s)' % target
                get_metrics_with_functions[base_name]['target'] = target
                post_content = '%s&target=%s' % (post_content, target)

            graphite_json_fetched = False
            try:
                r = requests.post(graphite_url, data=post_content, headers=headers, timeout=use_timeout)
                graphite_json_fetched = True
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(current_skyline_app_logger)
                current_logger.error('error :: %s :: data retrieval from Graphite failed - %s' % (
                    function_str, e))

            js = {}
            if graphite_json_fetched:
                try:
                    js = r.json()
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(current_skyline_app_logger)
                    current_logger.error('error :: %s :: failed to parse retrieved json - %s' % (
                        function_str, e))

            for item in js:
                data_error = None
                timeseries = None
                base_name = None
                try:
                    target = item['target']
                    for metric_base_name in metrics_to_get:
                        if metric_base_name in target:
                            base_name = metric_base_name
                    if not base_name:
                        if not log:
                            current_skyline_app_logger = current_skyline_app + 'Log'
                            current_logger = logging.getLogger(current_skyline_app_logger)
                        current_logger.error('error :: %s :: failed to determine base_name from get_metrics_with_functions[metric_base_name] with target: %s' % (
                            function_str, str(target)))
                        continue
                    datapoints = item['datapoints']
                    converted = []
                    for datapoint in datapoints:
                        try:
                            new_datapoint = [int(datapoint[1]), float(datapoint[0])]
                            converted.append(new_datapoint)
                        except Exception as e:
                            data_error = e
                    timeseries = converted
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(current_skyline_app_logger)
                    current_logger.error('error :: %s :: failed to parse data points from retrieved json data_error: %s - %s' % (
                        function_str, str(data_error), e))
                if base_name:
                    metrics_timeseries[base_name] = {}
                    metrics_timeseries[base_name]['functions'] = get_metrics_with_functions[base_name]['functions']
                    metrics_timeseries[base_name]['timeseries'] = None
                if timeseries:
                    metrics_timeseries[base_name]['timeseries'] = timeseries

    return metrics_timeseries
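
For illustration, a minimal sketch (not part of the original example) of how the
functions dictionary above is rendered into a nested Graphite target string. The
helper name and the metric and function values below are hypothetical:

# Illustrative sketch only - mirrors the target building loop in the example above.
def build_target(base_name, functions):
    # Wrap the metric in each Graphite function, innermost first.
    target = base_name
    if functions is not None:
        for function, function_arguments in functions.items():
            if function_arguments is None:
                target = '%s(%s)' % (function, target)
            elif isinstance(function_arguments, int):
                target = '%s(%s,%s)' % (function, target, function_arguments)
            elif isinstance(function_arguments, str):
                target = '%s(%s,"%s")' % (function, target, function_arguments)
            elif isinstance(function_arguments, dict):
                target = '%s(%s' % (function, target)
                for value in function_arguments.values():
                    if isinstance(value, int):
                        target = '%s,%s' % (target, value)
                    elif isinstance(value, str):
                        target = '%s,"%s"' % (target, value)
                target = '%s)' % target
    return target

# build_target('carbon.agents.host-a.metricsReceived',
#              {'nonNegativeDerivative': None,
#               'summarize': {'interval': '10min', 'func': 'sum'}})
# returns:
# 'summarize(nonNegativeDerivative(carbon.agents.host-a.metricsReceived),"10min","sum")'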
Example #29
0
def thunder_stale_metrics(current_skyline_app, log=True):
    """
    Determine stale metrics in each top level namespace.

    :param current_skyline_app: the app calling the function
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type log: boolean
    :return: (namespace_stale_metrics_dict, namespace_recovered_metrics_dict)
    :rtype: tuple

    """

    if current_skyline_app == 'analyzer':
        function_str = 'metrics_manager :: functions.thunder.thunder_stale_metrics'
    if current_skyline_app == 'webapp':
        function_str = 'functions.thunder.thunder_stale_metrics'

    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    def get_sparsity(base_name):
        """
        Determine the metric sparsity
        """
        success = True
        sparsity = None
        timeseries = []
        try:
            timeseries = get_metric_timeseries(current_skyline_app, base_name)
        except Exception as e:
            success = e
            sparsity = None
        if timeseries:
            try:
                sparsity = determine_data_sparsity(current_skyline_app,
                                                   timeseries, None, False)
            except Exception as e:
                success = e
                sparsity = None
        else:
            success = 'no timeseries data'
            sparsity = None

        return success, sparsity
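
    # The sparsity value is compared against settings.SPARSELY_POPULATED_PERCENTAGE
    # below; metrics that fall under the threshold are added to
    # sparsely_populated_metrics and are excluded from the stale metric checks
    # (unless the webapp has requested that sparsely populated metrics be
    # included).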

    now = int(time())
    namespace_stale_metrics_dict = {}
    namespace_recovered_metrics_dict = {}
    alerted_on_stale_metrics_dict = {}

    metrics_last_timestamp_dict = {}
    hash_key = 'analyzer.metrics.last_timeseries_timestamp'
    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis connection - %s' %
            (function_str, e))
        return namespace_stale_metrics_dict, namespace_recovered_metrics_dict

    try:
        metrics_last_timestamp_dict = redis_conn_decoded.hgetall(hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key %s - %s' %
            (function_str, hash_key, e))
    if not metrics_last_timestamp_dict:
        return namespace_stale_metrics_dict, namespace_recovered_metrics_dict

    # Do not send stale alerts for any identified sparsely populated metrics
    metrics_sparsity_dict = {}
    data_sparsity_hash_key = 'analyzer.metrics_manager.hash_key.metrics_data_sparsity'
    try:
        metrics_sparsity_dict = redis_conn_decoded.hgetall(
            data_sparsity_hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key %s - %s' %
            (function_str, data_sparsity_hash_key, e))
    sparsely_populated_metrics = []
    metrics_of_known_sparsity = []
    base_names_of_known_sparsity = []
    if metrics_sparsity_dict:
        metrics_of_known_sparsity = list(metrics_sparsity_dict.keys())
        for metric_name in metrics_of_known_sparsity:
            metric_name = str(metric_name)
            if metric_name.startswith(settings.FULL_NAMESPACE):
                base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
            else:
                base_name = metric_name
            base_names_of_known_sparsity.append(base_name)
            sparsity = metrics_sparsity_dict[metric_name]
            if float(sparsity) < settings.SPARSELY_POPULATED_PERCENTAGE:
                sparsely_populated_metrics.append(base_name)
        del metrics_sparsity_dict

    # @added 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp report on sparsely populated metrics as well
    exclude_sparsely_populated = False
    if current_skyline_app == 'webapp':
        try:
            exclude_sparsely_populated = redis_conn_decoded.get(
                'webapp.stale_metrics.exclude_sparsely_populated')
            if log:
                current_logger.info(
                    '%s :: Redis key webapp.stale_metrics.exclude_sparsely_populated - %s'
                    % (function_str, str(exclude_sparsely_populated)))
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error(
                'error :: %s :: failed to get Redis key webapp.stale_metrics.exclude_sparsely_populated - %s'
                % (function_str, e))
        if not exclude_sparsely_populated:
            sparsely_populated_metrics = []

    # Get all alerted on stale metrics
    alerted_on_stale_metrics_hash_key = 'thunder.alerted_on.stale_metrics'
    try:
        alerted_on_stale_metrics_dict = redis_conn_decoded.hgetall(
            alerted_on_stale_metrics_hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to get Redis hash key %s - %s' %
            (function_str, alerted_on_stale_metrics_hash_key, e))
    alerted_on_stale_metrics = []
    if alerted_on_stale_metrics_dict:
        alerted_on_stale_metrics = list(alerted_on_stale_metrics_dict.keys())

    # @added 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp report on alerted on metrics as well
    if current_skyline_app == 'webapp':
        alerted_on_stale_metrics = []

    # Get all the known custom stale periods
    custom_stale_metrics_dict = {}
    custom_stale_metrics_hash_key = 'analyzer.metrics_manager.custom_stale_periods'
    try:
        custom_stale_metrics_dict = redis_conn_decoded.hgetall(
            custom_stale_metrics_hash_key)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to create custom_stale_metrics_dict from Redis hash key %s - %s'
            % (function_str, custom_stale_metrics_hash_key, e))
    custom_stale_metrics = []
    if custom_stale_metrics_dict:
        custom_stale_metrics = list(custom_stale_metrics_dict.keys())

    metrics_last_timestamps = []
    parent_namespaces = []
    unique_base_names = list(metrics_last_timestamp_dict.keys())
    last_traceback = None
    last_error = None
    error_count = 0
    for base_name in unique_base_names:
        try:
            parent_namespace = base_name.split('.')[0]
            metrics_last_timestamps.append(
                [base_name,
                 int(metrics_last_timestamp_dict[base_name])])
            if len(parent_namespace) > 0:
                parent_namespaces.append(parent_namespace)
        except Exception as e:
            last_traceback = traceback.format_exc()
            last_error = e
            error_count += 1
    if last_error:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: %s errors encountered while creating metrics_last_timestamps, last reported error - %s'
            % (function_str, str(error_count), last_error))
        current_logger.error('error :: %s :: last reported Traceback' %
                             (function_str))
        current_logger.error('%s' % (str(last_traceback)))

    total_stale_metrics_count = 0
    total_recovered_metrics_count = 0
    test_stale_metrics_namespaces = []

    # @added 20220208 - Feature #4376: webapp - update_external_settings
    # If alert_on_stale_metrics is not enabled for an external_settings namespace
    # do not alert
    do_not_alert_on_namespaces = []

    parent_namespaces = list(set(parent_namespaces))

    # @added 20210620 - Branch #1444: thunder
    #                   Feature #4076: CUSTOM_STALE_PERIOD
    # Handle multi level namespaces
    external_settings = {}
    try:
        external_settings = get_external_settings(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: get_external_settings failed - %s' %
            (function_str, e))
    external_parent_namespaces_stale_periods = {}
    if external_settings:
        for config_id in list(external_settings.keys()):
            alert_on_stale_metrics = False
            try:
                alert_on_stale_metrics = external_settings[config_id][
                    'alert_on_stale_metrics']['enabled']
            except KeyError:
                alert_on_stale_metrics = False
            stale_metrics_stale_period = settings.STALE_PERIOD
            if alert_on_stale_metrics:
                try:
                    stale_metrics_stale_period = external_settings[config_id][
                        'alert_on_stale_metrics']['stale_period']
                except KeyError:
                    stale_metrics_stale_period = settings.STALE_PERIOD
            namespace = None
            if stale_metrics_stale_period:
                try:
                    namespace = external_settings[config_id]['namespace']
                except KeyError:
                    namespace = False

            # @added 20220208 - Feature #4376: webapp - update_external_settings
            # If alert_on_stale_metrics is not enabled do not alert
            if not alert_on_stale_metrics:
                do_not_alert_on_namespaces.append(namespace)
                namespace = None

            try:
                expiry = external_settings[config_id][
                    'alert_on_stale_metrics']['expiry']
            except KeyError:
                expiry = 1800
            if namespace and alert_on_stale_metrics and expiry:
                external_parent_namespaces_stale_periods[namespace] = {}
                external_parent_namespaces_stale_periods[namespace][
                    'stale_period'] = int(stale_metrics_stale_period)
                external_parent_namespaces_stale_periods[namespace][
                    'expiry'] = int(expiry)
    external_parent_namespaces = []
    if external_parent_namespaces_stale_periods:
        # external_parent_namespaces = list(external_parent_namespaces.keys())
        external_parent_namespaces = list(
            external_parent_namespaces_stale_periods.keys())

    parent_namespace_metrics_processed = []
    custom_stale_period_namespaces = []
    # Sort the list by the namespaces with the most elements to the least as
    # first match wins
    if settings.CUSTOM_STALE_PERIOD:
        custom_stale_period_namespaces = list(
            settings.CUSTOM_STALE_PERIOD.keys())
        custom_stale_period_namespaces_elements_list = []
        for custom_stale_period_namespace in custom_stale_period_namespaces:
            namespace_elements = len(custom_stale_period_namespace.split('.'))
            custom_stale_period_namespaces_elements_list.append(
                [custom_stale_period_namespace, namespace_elements])
        sorted_custom_stale_period_namespaces = sorted(
            custom_stale_period_namespaces_elements_list,
            key=lambda x: (x[1]),
            reverse=True)
        if sorted_custom_stale_period_namespaces:
            custom_stale_period_namespaces = [
                x[0] for x in sorted_custom_stale_period_namespaces
            ]
    # Order by setting priority
    parent_namespaces = external_parent_namespaces + custom_stale_period_namespaces + parent_namespaces

    for parent_namespace in parent_namespaces:

        # @added 20220208 - Feature #4376: webapp - update_external_settings
        # If alert_on_stale_metrics is not enabled do not alert
        if parent_namespace in do_not_alert_on_namespaces:
            continue

        parent_namespace_stale_metrics_count = 0
        namespace_stale_metrics_dict[parent_namespace] = {}
        namespace_stale_metrics_dict[parent_namespace]['metrics'] = {}

        namespace_recovered_metrics_dict[parent_namespace] = {}
        namespace_recovered_metrics_dict[parent_namespace]['metrics'] = {}

        # metrics that are in the parent namespace
        parent_namespace_metrics = [
            item for item in metrics_last_timestamps
            if str(item[0]).startswith(parent_namespace)
        ]
        unfiltered_parent_namespace_metrics_count = len(
            parent_namespace_metrics)
        # @added 20210620 - Branch #1444: thunder
        #                   Feature #4076: CUSTOM_STALE_PERIOD
        # Handle multi level namespaces by filtering out metrics that have
        # already been processed in a longer parent_namespace
        parent_namespace_metrics = [
            item for item in parent_namespace_metrics
            if str(item[0]) not in parent_namespace_metrics_processed
        ]
        if parent_namespace_metrics:
            parent_namespace_metric_names = [
                item[0] for item in parent_namespace_metrics
            ]
            parent_namespace_metrics_processed = parent_namespace_metrics_processed + parent_namespace_metric_names
        if log:
            current_logger.info(
                '%s :: checking stale metrics in the \'%s.\' namespace on %s metrics (of %s filtered by processed)'
                % (function_str, parent_namespace,
                   str(len(parent_namespace_metrics)),
                   str(unfiltered_parent_namespace_metrics_count)))

        # Now check metrics that are default STALE_PERIOD metrics and are not
        # CUSTOM_STALE_PERIOD metrics
        last_error = None
        stale_period_parent_namespace_metrics = [
            item for item in parent_namespace_metrics
            if item[0] not in custom_stale_metrics
        ]
        for base_name, timestamp in stale_period_parent_namespace_metrics:
            if base_name in sparsely_populated_metrics:
                continue
            try:
                # Only alert once on stale metrics and identify as recovered
                if base_name in alerted_on_stale_metrics:
                    if int(timestamp) > (now - settings.STALE_PERIOD):
                        namespace_recovered_metrics_dict[parent_namespace][
                            'metrics'][base_name] = int(timestamp)
                        total_recovered_metrics_count += 1
                    else:
                        continue
                if int(timestamp) < (now - settings.STALE_PERIOD):

                    # Determine the metric sparsity if it is not known
                    if base_name not in base_names_of_known_sparsity:
                        success = None
                        sparsity = None
                        try:
                            success, sparsity = get_sparsity(base_name)
                            if sparsity is not None:
                                if float(
                                        sparsity
                                ) < settings.SPARSELY_POPULATED_PERCENTAGE:
                                    if current_skyline_app == 'analyzer':
                                        sparsely_populated_metrics.append(
                                            base_name)
                                        continue
                                    if current_skyline_app == 'webapp' and exclude_sparsely_populated:
                                        sparsely_populated_metrics.append(
                                            base_name)
                                        continue
                            else:
                                if success is not True:
                                    if not log:
                                        current_skyline_app_logger = current_skyline_app + 'Log'
                                        current_logger = logging.getLogger(
                                            current_skyline_app_logger)
                                    current_logger.error(
                                        'error :: %s :: get_sparsity failed for %s - %s'
                                        % (function_str, base_name,
                                           str(success)))
                        except Exception as e:
                            if not log:
                                current_skyline_app_logger = current_skyline_app + 'Log'
                                current_logger = logging.getLogger(
                                    current_skyline_app_logger)
                            current_logger.error(
                                'error :: %s :: get_sparsity failed for %s - %s'
                                % (function_str, base_name, e))

                    namespace_stale_metrics_dict[parent_namespace]['metrics'][
                        base_name] = timestamp
                    total_stale_metrics_count += 1
                    parent_namespace_stale_metrics_count += 1
            except Exception as e:
                last_traceback = traceback.format_exc()
                last_error = e
                error_count += 1
        if last_error:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(
                'error :: %s :: %s errors encountered while determining stale_period_parent_namespace_metrics, last reported error - %s'
                % (function_str, str(error_count), last_error))
            current_logger.error('error :: %s :: last reported Traceback' %
                                 (function_str))
            current_logger.error('%s' % (str(last_traceback)))

        # Now check metrics that are CUSTOM_STALE_PERIOD metrics
        custom_stale_period_parent_namespace_metrics = [
            item for item in parent_namespace_metrics
            if item[0] in custom_stale_metrics
        ]
        last_error = None
        for base_name, timestamp in custom_stale_period_parent_namespace_metrics:
            if base_name in sparsely_populated_metrics:
                continue
            try:
                # Only alert once on stale metrics and identify as recovered
                if base_name in alerted_on_stale_metrics:
                    if int(timestamp) > (
                            now - int(custom_stale_metrics_dict[base_name])):
                        namespace_recovered_metrics_dict[parent_namespace][
                            'metrics'][base_name] = int(timestamp)
                        total_recovered_metrics_count += 1
                    else:
                        continue
                if int(timestamp) < (
                        now - int(custom_stale_metrics_dict[base_name])):

                    # Determine the metric sparsity if it is not known
                    if base_name not in base_names_of_known_sparsity:
                        success = None
                        sparsity = None
                        try:
                            success, sparsity = get_sparsity(base_name)
                            if sparsity is not None:
                                if float(
                                        sparsity
                                ) < settings.SPARSELY_POPULATED_PERCENTAGE:
                                    # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint
                                    # On webapp report on sparsely_populated_metrics on metrics as well
                                    if current_skyline_app == 'analyzer':
                                        sparsely_populated_metrics.append(
                                            base_name)
                                        continue
                                    if current_skyline_app == 'webapp' and exclude_sparsely_populated:
                                        sparsely_populated_metrics.append(
                                            base_name)
                                        continue
                            else:
                                if success is not True:
                                    if not log:
                                        current_skyline_app_logger = current_skyline_app + 'Log'
                                        current_logger = logging.getLogger(
                                            current_skyline_app_logger)
                                    current_logger.error(
                                        'error :: %s :: get_sparsity failed for %s - %s'
                                        % (function_str, base_name,
                                           str(success)))
                        except Exception as e:
                            if not log:
                                current_skyline_app_logger = current_skyline_app + 'Log'
                                current_logger = logging.getLogger(
                                    current_skyline_app_logger)
                            current_logger.error(
                                'error :: %s :: get_sparsity failed for %s - %s'
                                % (function_str, base_name, e))

                    namespace_stale_metrics_dict[parent_namespace]['metrics'][
                        base_name] = timestamp
                    total_stale_metrics_count += 1
                    parent_namespace_stale_metrics_count += 1
            except Exception as e:
                last_traceback = traceback.format_exc()
                last_error = e
                error_count += 1
        if last_error:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(
                'error :: %s :: %s errors encountered while determining custom_stale_period_parent_namespace_metrics, last reported error - %s'
                % (function_str, str(error_count), last_error))
            current_logger.error('error :: %s :: last reported Traceback' %
                                 (function_str))
            current_logger.error('%s' % (str(last_traceback)))

        if parent_namespace_stale_metrics_count:
            if log:
                current_logger.info(
                    '%s :: %s stale metrics found for %s' %
                    (function_str, str(parent_namespace_stale_metrics_count),
                     parent_namespace))

        # Allow to test
        if not parent_namespace_stale_metrics_count:
            # Allow to test
            thunder_test_alert_key_data = None
            thunder_test_alert_key = 'thunder.test.alert.stale_metrics.%s' % parent_namespace
            try:
                thunder_test_alert_key_data = redis_conn_decoded.get(
                    thunder_test_alert_key)
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(
                        current_skyline_app_logger)
                current_logger.error(traceback.format_exc())
                current_logger.error(
                    'error :: %s :: failed to get Redis key %s - %s' %
                    (function_str, thunder_test_alert_key, e))
            if thunder_test_alert_key_data:
                try:
                    thunder_test_data = literal_eval(
                        thunder_test_alert_key_data)
                    stale_period = thunder_test_data['stale_period']
                    expiry = thunder_test_data['expiry']
                    stale_count = thunder_test_data['stale_count']
                    if log:
                        current_logger.info(
                            '%s :: THUNDER STALE_METRICS TEST REQUESTED FOR - \'%s.\' namespace using TEST stale_period of %s and expiry of %s for %s metrics'
                            %
                            (function_str, parent_namespace, str(stale_period),
                             str(expiry), str(stale_count)))
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to get stale_period, expiry and stale_count for Redis key %s - %s'
                        % (function_str, thunder_test_alert_key, e))
                for base_name, timestamp in parent_namespace_metrics[
                        -stale_count:]:
                    namespace_stale_metrics_dict[parent_namespace]['metrics'][
                        base_name] = timestamp
                    total_stale_metrics_count += 1
                    parent_namespace_stale_metrics_count += 1
                test_stale_metrics_count = len(
                    list(namespace_stale_metrics_dict[parent_namespace]
                         ['metrics'].keys()))
                test_stale_metrics_namespaces.append(parent_namespace)
                if log:
                    current_logger.info(
                        '%s :: THUNDER STALE_METRICS TEST REQUESTED FOR - \'%s.\' namespace sending %s TEST stale_metrics'
                        % (function_str, parent_namespace,
                           str(test_stale_metrics_count)))

    if log:
        current_logger.info('%s :: total stale metrics found - %s' %
                            (function_str, str(total_stale_metrics_count)))
        current_logger.info('%s :: total recovered stale metrics - %s' %
                            (function_str, str(total_recovered_metrics_count)))
        current_logger.info(
            '%s :: skipped checking %s sparsely_populated_metrics' %
            (function_str, str(len(sparsely_populated_metrics))))

    # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp request do not send thunder events
    # if namespace_stale_metrics_dict:
    if namespace_stale_metrics_dict and current_skyline_app == 'analyzer':
        parent_namespaces = list(namespace_stale_metrics_dict.keys())
        for parent_namespace in parent_namespaces:
            stale_metrics = list(namespace_stale_metrics_dict[parent_namespace]
                                 ['metrics'].keys())
            if len(stale_metrics) > 0:

                # Check if there is a thunder.alert.no_data Redis key for the
                # namespace and skip if there is
                thunder_no_data_alert_key_exists = False
                thunder_no_data_alert_key = 'thunder.alert.no_data.%s' % parent_namespace
                try:
                    thunder_no_data_alert_key_exists = redis_conn_decoded.get(
                        thunder_no_data_alert_key)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to get Redis key %s - %s' %
                        (function_str, thunder_no_data_alert_key, e))
                if thunder_no_data_alert_key_exists:
                    if log:
                        current_logger.info(
                            '%s :: skipping sending thunder event for stale metrics on %s as thunder no_data alert key exists for the namespace'
                            % (function_str, parent_namespace))
                    continue

                # Check if there is a thunder.alert.analyzer.up.alert Redis key for the
                # namespace and skip if there is
                thunder_analyzer_alert_key_exists = False
                thunder_analyzer_alert_key = 'thunder.alert.analyzer.up.alert'
                try:
                    thunder_analyzer_alert_key_exists = redis_conn_decoded.get(
                        thunder_analyzer_alert_key)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(traceback.format_exc())
                    current_logger.error(
                        'error :: %s :: failed to get Redis key %s - %s' %
                        (function_str, thunder_analyzer_alert_key, e))
                if thunder_analyzer_alert_key_exists:
                    if log:
                        current_logger.info(
                            '%s :: skipping sending thunder event for stale metrics on %s as thunder analyzer alert key exists'
                            % (function_str, parent_namespace))
                    continue

                level = 'alert'
                event_type = 'stale_metrics'
                message = '%s - %s - no new data for %s metrics' % (
                    level, parent_namespace, str(len(stale_metrics)))
                status = 'not receiving data for some metrics'
                if parent_namespace in test_stale_metrics_namespaces:
                    message = '%s - %s - no new data for %s metrics - TEST' % (
                        level, parent_namespace, str(len(stale_metrics)))
                    status = 'not receiving data for some metrics - TEST'
                thunder_event = {
                    'level': level,
                    'event_type': event_type,
                    'message': message,
                    'app': current_skyline_app,
                    'metric': None,
                    'source': current_skyline_app,
                    'timestamp': time(),
                    'expiry': settings.STALE_PERIOD,
                    'data': {
                        'namespace': parent_namespace,
                        'stale_metrics': stale_metrics,
                        'status': status,
                    },
                }
                submitted = False
                try:
                    submitted = thunder_send_event(current_skyline_app,
                                                   thunder_event,
                                                   log=True)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(
                        'error :: %s :: error encountered with thunder_send_event - %s'
                        % (function_str, e))
                if submitted:
                    if log:
                        current_logger.info(
                            '%s :: sent thunder event for %s stale metrics on namespace %s'
                            % (function_str, str(
                                len(stale_metrics)), parent_namespace))

    # @modified 20210617 - Feature #4144: webapp - stale_metrics API endpoint
    # On webapp request do not send thunder events
    # if namespace_recovered_metrics_dict and total_recovered_metrics_count:
    if namespace_recovered_metrics_dict and total_recovered_metrics_count and current_skyline_app == 'analyzer':
        parent_namespaces = list(namespace_recovered_metrics_dict.keys())
        for parent_namespace in parent_namespaces:
            stale_metrics = list(
                namespace_recovered_metrics_dict[parent_namespace]
                ['metrics'].keys())
            if len(stale_metrics) > 0:
                level = 'notice'
                event_type = 'stale_metrics'
                message = '%s - %s - new data for %s metrics' % (
                    level, parent_namespace, str(len(stale_metrics)))
                status = 'recovered'
                thunder_event = {
                    'level': level,
                    'event_type': event_type,
                    'message': message,
                    'app': current_skyline_app,
                    'metric': None,
                    'source': current_skyline_app,
                    'timestamp': time(),
                    'expiry': 59,
                    'data': {
                        'namespace': parent_namespace,
                        'stale_metrics': stale_metrics,
                        'status': status,
                    },
                }
                submitted = False
                try:
                    submitted = thunder_send_event(current_skyline_app,
                                                   thunder_event,
                                                   log=True)
                except Exception as e:
                    if not log:
                        current_skyline_app_logger = current_skyline_app + 'Log'
                        current_logger = logging.getLogger(
                            current_skyline_app_logger)
                    current_logger.error(
                        'error :: %s :: error encountered with thunder_send_event - %s'
                        % (function_str, e))
                if submitted:
                    if log:
                        current_logger.info(
                            '%s :: sent thunder event for %s recovered stale metrics on namespace %s'
                            % (function_str, str(
                                len(stale_metrics)), parent_namespace))

    return namespace_stale_metrics_dict, namespace_recovered_metrics_dict
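
A minimal usage sketch (illustrative only; the calling app, namespaces and
metric names are hypothetical, and a populated Skyline Redis is assumed):

# Illustrative usage of the returned tuple.
stale_dict, recovered_dict = thunder_stale_metrics('webapp', log=False)
for namespace, data in stale_dict.items():
    stale_metrics = data['metrics']  # {base_name: last_data_timestamp}
    if stale_metrics:
        print('%s: %s stale metrics, e.g. %s' % (
            namespace, len(stale_metrics), sorted(stale_metrics)[0]))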
Example #30
0
# @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
# @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
#                      Branch #3262: py3
# Use get_redis_conn and get_redis_conn_decoded on Redis sets when bytes
# objects need to be decoded from utf-8 to str
# if settings.REDIS_PASSWORD:
#     redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
# else:
#     redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

# @added 20191030 - Bug #3266: py3 Redis binary objects not strings
#                   Branch #3262: py3
# Added functions to deal with the Redis connection and the
# charset='utf-8', decode_responses=True arguments required in py3
redis_conn = get_redis_conn(skyline_app)
redis_conn_decoded = get_redis_conn_decoded(skyline_app)
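
# For reference, a hedged sketch of what a decoded-connection helper of this
# kind might look like, based on the StrictRedis arguments shown in the
# commented-out code above; this is not the actual Skyline implementation.
# def get_redis_conn_decoded_sketch(current_skyline_app):
#     from redis import StrictRedis
#     import settings  # the Skyline settings module used throughout these examples
#     if settings.REDIS_PASSWORD:
#         return StrictRedis(
#             password=settings.REDIS_PASSWORD,
#             unix_socket_path=settings.REDIS_SOCKET_PATH,
#             charset='utf-8', decode_responses=True)
#     return StrictRedis(
#         unix_socket_path=settings.REDIS_SOCKET_PATH,
#         charset='utf-8', decode_responses=True)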


def get_anomaly(request_type):
    """
    Query the database for the anomaly details
    """

    logger = logging.getLogger(skyline_app_logger)

    if isinstance(request_type, int):
        latest = False
    else:
        latest = True

    if latest: