Example 1
    def __init__(self, parent_pid):
        super(Aggregator, self).__init__()
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.parent_pid = parent_pid
        self.daemon = True
        self.current_pid = getpid()
Example 2
 def __init__(self, parent_pid):
     """
     Initialize the SNAB_flux_load_test
     """
     super(SNAB_flux_load_test, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example 3
 def __init__(self, parent_pid):
     """
     Initialize Rolling
     """
     super(RollingThunder, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example 4
 def __init__(self, parent_pid):
     """
     Initialize RelatedMetrics
     """
     super(RelatedMetrics, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
Example 5
 def __init__(self, parent_pid):
     """
     Initialize Cloudbursts
     """
     super(Cloudbursts, self).__init__()
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
     self.daemon = True
     self.parent_pid = parent_pid
     self.current_pid = getpid()
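
Example 1 to Example 5 all follow the same initialisation pattern: a Skyline daemon class subclasses multiprocessing.Process and creates both an undecoded (bytes) Redis connection and a decoded (str) one. The following is a minimal, hedged sketch of that shared pattern; the class name ExampleDaemon and the skyline_app value are illustrative, and the helper imports are assumed to come from skyline_functions, as in Example 10.

from multiprocessing import Process
from os import getpid

# Assumed import path, as used for get_redis_conn in Example 10
from skyline_functions import get_redis_conn, get_redis_conn_decoded

skyline_app = 'example_app'  # illustrative app name


class ExampleDaemon(Process):

    def __init__(self, parent_pid):
        """
        Initialize ExampleDaemon (an illustrative sketch of the shared pattern)
        """
        super(ExampleDaemon, self).__init__()
        # Bytes responses, suitable for msgpack packed time series values
        self.redis_conn = get_redis_conn(skyline_app)
        # utf-8 decoded responses, suitable for Redis set and hash members
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
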
Example 6
def thunder_send_event(current_skyline_app, event, log=True):
    """
    Add an event to the thunder.events Redis set or the thunder check dir if
    Redis is not available.

    :param current_skyline_app: the app calling the function
    :param event: the event data
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type event: dict
    :type log: boolean
    :return: submitted
    :rtype: boolean

    """

    function_str = 'functions.thunder.thunder_send_event'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    submitted = 0
    try:
        redis_conn = get_redis_conn(current_skyline_app)
        submitted = redis_conn.sadd('thunder.events', str(event))
        if submitted:
            return True
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: failed to add %s to thunder.events Redis set - %s'
            % (function_str, str(event), e))

    # If the thunder event was not added to the Redis set, create the event_file
    # Ensure a logger exists for this fallback path, as current_logger is None
    # when log=False and no exception was raised above
    if not current_logger:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    if not path.exists(THUNDER_EVENTS_DIR):
        mkdir_p(THUNDER_EVENTS_DIR)
        current_logger.info('created dir - %s' % THUNDER_EVENTS_DIR)
    event_file = '%s/%s.thunder.event.dict' % (THUNDER_EVENTS_DIR, str(time()))
    try:
        write_data_to_file(current_skyline_app, event_file, 'w', str(event))
        current_logger.info('added thunder event file - %s' % event_file)
        submitted = True
    except Exception as e:
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: failed to add thunder event file - %s - %s' %
            (event_file, e))
        submitted = False

    return submitted
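
A hedged usage sketch for thunder_send_event follows. The event dictionary fields shown are illustrative only, not the canonical Skyline event schema, and 'flux' stands in for whichever app is calling.

from time import time

# Illustrative event payload; real callers define the actual event structure
event = {
    'level': 'notice',
    'event_type': 'worker_started',
    'timestamp': time(),
    'app': 'flux',
    'data': {'status': 'started'},
}
submitted = thunder_send_event('flux', event, log=True)
if not submitted:
    # Neither the Redis set add nor the fallback event file write succeeded
    print('failed to submit thunder event')
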
Example 7
    def __init__(self, parent_pid):
        """
        Initialize Luminosity

        Create the :obj:`redis_conn` a Redis client object
        Create the :obj:`correlations` list
        Create the :obj:`mysql_conn` MySQLConnection object
        Create the :obj:`memcache_client` a constructor that does not make a
        connection to memcached. The first call to a method on the object will
        do that.

        """
        super(Luminosity, self).__init__()
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
        # types need to be decoded as utf-8 to str
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

        # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        # Added functions to handle the Redis connection and the
        # charset='utf-8', decode_responses=True arguments required in py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
        #                      Task #3032: Debug number of Python processes and memory use
        #                      Branch #3002: docker
        # Reduce amount of Manager instances that are used as each requires a
        # copy of entire memory to be copied into each subprocess so this
        # results in a python process per Manager instance, using as much
        # memory as the parent.  OK on a server, not so much in a container.
        # Disabled all the Manager().list() below and replaced with Redis sets
        # self.correlations = Manager().list()
        # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
        # self.metrics_checked_for_correlation = Manager().list()
        # self.runtimes = Manager().list()
        self.mysql_conn = mysql.connector.connect(**config)
        if settings.MEMCACHE_ENABLED:
            self.memcache_client = pymemcache_Client(
                (settings.MEMCACHED_SERVER_IP, settings.MEMCACHED_SERVER_PORT),
                connect_timeout=0.1,
                timeout=0.2)
        else:
            self.memcache_client = None
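
The comments above describe replacing direct StrictRedis construction with the get_redis_conn and get_redis_conn_decoded helpers. Below is a minimal sketch of what those helpers could look like, assuming the Unix socket and password settings referenced in the commented-out code; the real implementations live in skyline_functions.py and include error handling not shown here.

from redis import StrictRedis
import settings


def get_redis_conn(current_skyline_app):
    # Returns a client whose responses are bytes (msgpack data stays packed)
    if settings.REDIS_PASSWORD:
        return StrictRedis(password=settings.REDIS_PASSWORD,
                           unix_socket_path=settings.REDIS_SOCKET_PATH)
    return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)


def get_redis_conn_decoded(current_skyline_app):
    # Returns a client that decodes responses to utf-8 str, per the
    # charset='utf-8', decode_responses=True arguments noted in the comments
    if settings.REDIS_PASSWORD:
        return StrictRedis(password=settings.REDIS_PASSWORD,
                           unix_socket_path=settings.REDIS_SOCKET_PATH,
                           charset='utf-8', decode_responses=True)
    return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH,
                       charset='utf-8', decode_responses=True)
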
Example 8
    def __init__(self, queue, parent_pid):
        super(Worker, self).__init__()
        # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.q = queue
        self.parent_pid = parent_pid
        self.daemon = True
Example 9
 def __init__(self, parent_pid):
     super(Worker, self).__init__()
     self.parent_pid = parent_pid
     self.daemon = True
     # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
     #                      Branch #3262: py3
     # if settings.REDIS_PASSWORD:
     #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
     # else:
     #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
     # @added 20191111 - Bug #3266: py3 Redis binary objects not strings
     #                   Branch #3262: py3
      # Added functions to handle the Redis connection and the
      # charset='utf-8', decode_responses=True arguments required in py3
     self.redis_conn = get_redis_conn(skyline_app)
     self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)
Example 10
def is_anomalously_anomalous(metric_name, ensemble, datapoint):
    """
    This method runs a meta-analysis on the metric to determine whether the
    metric has a past history of triggering. TODO: weight intervals based on datapoint
    """
    # We want the datapoint to avoid triggering twice on the same data
    new_trigger = [time(), datapoint]

    # Get the old history
    # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Use get_redis_conn
    from skyline_functions import get_redis_conn
    redis_conn = get_redis_conn(skyline_app)

    raw_trigger_history = redis_conn.get('trigger_history.' + metric_name)
    if not raw_trigger_history:
        redis_conn.set('trigger_history.' + metric_name,
                       packb([(time(), datapoint)]))
        return True

    trigger_history = unpackb(raw_trigger_history)

    # Are we (probably) triggering on the same data?
    if (new_trigger[1] == trigger_history[-1][1]
            and new_trigger[0] - trigger_history[-1][0] <= 300):
        return False

    # Update the history
    trigger_history.append(new_trigger)
    redis_conn.set('trigger_history.' + metric_name, packb(trigger_history))

    # Should we surface the anomaly?
    trigger_times = [x[0] for x in trigger_history]
    intervals = [
        trigger_times[i + 1] - trigger_times[i]
        for i, v in enumerate(trigger_times) if (i + 1) < len(trigger_times)
    ]

    series = pandas.Series(intervals)
    mean = series.mean()
    stdDev = series.std()

    return abs(intervals[-1] - mean) > 3 * stdDev
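
To make the final check concrete, here is a small, self-contained worked example of the interval logic above, with no Redis involved; the trigger timestamps are illustrative.

import pandas

# Hypothetical trigger history timestamps (seconds)
trigger_times = [100, 400, 700, 1000, 5000]
intervals = [t2 - t1 for t1, t2 in zip(trigger_times, trigger_times[1:])]
# intervals == [300, 300, 300, 4000]
series = pandas.Series(intervals)
mean = series.mean()      # 1225.0
stdDev = series.std()     # 1850.0 (sample standard deviation)
# The last interval deviates from the mean by 2775, which is below
# 3 * stdDev (5550), so this history would not be surfaced as anomalous
print(abs(intervals[-1] - mean) > 3 * stdDev)  # False
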
Example 11
    def __init__(self, parent_pid, skip_mini):
        super(Roomba, self).__init__()
        # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                      Branch #3262: py3
        # if settings.REDIS_PASSWORD:
        #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
        # else:
        #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

        # @added 20191030 - Bug #3266: py3 Redis binary objects not strings
        #                   Branch #3262: py3
        # Added functions to handle the Redis connection and the
        # charset='utf-8', decode_responses=True arguments required in py3
        self.redis_conn = get_redis_conn(skyline_app)
        self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        self.daemon = True
        self.parent_pid = parent_pid
        self.skip_mini = skip_mini
Example 12
def get_metric_timeseries(current_skyline_app, metric_name, log=True):
    """
    Return a metric time series as a list e.g.
    [[ts, value], [ts, value], ..., [ts, value]]

    :param current_skyline_app: the app calling the function
    :param metric_name: the full Redis metric name
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type metric_name: str
    :type log: boolean
    :return: timeseries
    :rtype: list

    """

    function_str = 'functions.redis.get_metric_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    timeseries = []
    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to fetch time series for %s - %s'
            % (function_str, metric_name, e))

    if metric_name.startswith(FULL_NAMESPACE):
        metric_name = str(metric_name)
    else:
        metric_name = '%s%s' % (FULL_NAMESPACE, str(metric_name))

    raw_series = None
    try:
        raw_series = redis_conn.get(metric_name)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error('error :: failed to get %s from Redis - %s' %
                             (metric_name, e))
        raw_series = None

    if not raw_series:
        return timeseries

    try:
        unpacker = Unpacker(use_list=False)
        unpacker.feed(raw_series)
        timeseries = list(unpacker)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: failed to unpack %s time series from Redis data - %s' %
            (metric_name, e))
        timeseries = []

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis to get derivative_metrics - %s'
            % (function_str, e))
    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(
            redis_conn_decoded.smembers(
                'aet.metrics_manager.derivative_metrics'))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: failed to connect to Redis for smembers of derivative_metrics - %s'
            % (function_str, e))
        derivative_metrics = []
    if metric_name in derivative_metrics:
        if len(timeseries) > 3:
            try:
                derivative_timeseries = nonNegativeDerivative(timeseries)
                timeseries = derivative_timeseries
            except Exception as e:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(
                        current_skyline_app_logger)
                current_logger.error(
                    'error :: %s :: nonNegativeDerivative failed - %s' %
                    (function_str, e))

    return timeseries
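
A hedged usage sketch follows; the calling app name and metric base name are illustrative values.

timeseries = get_metric_timeseries('webapp', 'server-1.cpu.user', log=True)
if timeseries:
    last_timestamp, last_value = timeseries[-1]
    print('%s datapoints, latest: %s, %s' % (
        str(len(timeseries)), str(last_timestamp), str(last_value)))
else:
    print('no time series data returned')
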
Example 13
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('aggregator :: starting aggregator')

        # Determine a primary aggregator
        aggregator_pid = getpid()
        main_process_pid = 0
        try:
            main_process_pid = int(
                self.redis_conn_decoded.get('flux.main_process_pid'))
            if main_process_pid:
                logger.info(
                    'aggregator :: main_process_pid found in Redis key - %s' %
                    str(main_process_pid))
        except:
            main_process_pid = 0
        if not main_process_pid:
            logger.error(
                'error :: aggregator :: no main_process_pid known, exiting')
            sys.exit(1)

        primary_aggregator_key = 'flux.primary_aggregator_pid.%s' % str(
            main_process_pid)
        logger.info(
            'aggregator :: starting primary_aggregator election using primary_aggregator_key: %s'
            % primary_aggregator_key)
        sleep_for = random.uniform(0.1, 1.5)
        logger.info(
            'aggregator :: starting primary_aggregator election - sleeping for %s'
            % str(sleep_for))
        sleep(sleep_for)
        primary_aggregator_pid = 0
        try:
            primary_aggregator_pid = int(
                self.redis_conn_decoded.get(primary_aggregator_key))
            if primary_aggregator_pid:
                logger.info(
                    'aggregator :: primary_aggregator_pid found in Redis key - %s'
                    % str(primary_aggregator_pid))
        except:
            primary_aggregator_pid = 0
        if not primary_aggregator_pid:
            try:
                self.redis_conn.setex(primary_aggregator_key, 300,
                                      aggregator_pid)
                primary_aggregator_pid = int(
                    self.redis_conn_decoded.get(primary_aggregator_key))
                logger.info(
                    'aggregator :: set self pid to primary_aggregator - %s' %
                    str(primary_aggregator_pid))
            except:
                primary_aggregator_pid = 0
        primary_aggregator = False
        if primary_aggregator_pid == aggregator_pid:
            primary_aggregator = True
        logger.info(
            'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s'
            % (str(primary_aggregator_pid), str(primary_aggregator)))

        last_flush = int(time()) - 59
        remove_from_flux_queue_redis_set = []

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'aggregator :: cannot connect to redis at socket path %s'
                        % (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    try:
                        self.redis_conn = get_redis_conn(skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: could not get_redis_conn - %s'
                            % str(e))
                    try:
                        self.redis_conn_decoded = get_redis_conn_decoded(
                            skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: could not get_redis_conn_decoded - %s'
                            % str(e))

            try:
                time_now = int(time())
                while (time_now - last_flush) <= 59:
                    sleep(1)
                    remove_from_flux_queue_redis_set = []
                    time_now = int(time())

                primary_aggregator_pid = 0
                try:
                    primary_aggregator_pid = int(
                        self.redis_conn_decoded.get(primary_aggregator_key))
                    if primary_aggregator_pid:
                        logger.info(
                            'aggregator :: primary_aggregator_pid found in Redis key - %s'
                            % str(primary_aggregator_pid))
                except:
                    primary_aggregator_pid = 0
                if not primary_aggregator_pid:
                    try:
                        self.redis_conn.setex(primary_aggregator_key, 300,
                                              aggregator_pid)
                        primary_aggregator_pid = int(
                            self.redis_conn_decoded.get(
                                primary_aggregator_key))
                        logger.info(
                            'aggregator :: set self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                    except:
                        primary_aggregator_pid = 0
                primary_aggregator = False
                if primary_aggregator_pid == aggregator_pid:
                    primary_aggregator = True
                logger.info(
                    'aggregator :: primary_aggregator_pid is set to %s, primary_aggregator: %s'
                    % (str(primary_aggregator_pid), str(primary_aggregator)))

                flux_aggregator_queue = []
                if primary_aggregator:
                    logger.info('aggregator :: checking for data to aggregate')
                    try:
                        flux_aggregator_queue = self.redis_conn_decoded.smembers(
                            'flux.aggregator.queue')
                        logger.info(
                            'aggregator :: %s entries in flux.aggregator.queue to process'
                            % str(len(flux_aggregator_queue)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: could not get the flux.aggregator.queue set from Redis'
                        )
                else:
                    logger.info(
                        'aggregator :: not primary, in standby to take over should the primary_aggregator fail'
                    )

                flux_aggregator_queue_items = []
                all_metrics = []
                if flux_aggregator_queue:
                    for flux_aggregator_queue_item_str in flux_aggregator_queue:
                        try:
                            flux_aggregator_queue_item = literal_eval(
                                flux_aggregator_queue_item_str)
                            all_metrics.append(flux_aggregator_queue_item[0])
                            flux_aggregator_queue_items.append([
                                flux_aggregator_queue_item,
                                flux_aggregator_queue_item_str
                            ])
                            # self.redis_conn.srem('flux.aggregator.queue', flux_aggregator_queue_item_str)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to evaluate item from flux.aggregator.queue Redis set'
                            )
                metrics = list(set(all_metrics))
                for metric in metrics:
                    last_metric_flush = last_flush
                    last_metric_flush_str = None
                    try:
                        last_metric_flush_str = self.redis_conn_decoded.hget(
                            'flux.aggregate_metrics.last_flush', metric)
                        # Handle new metrics without throwing an error if they do
                        # not have an entry in the hash
                        if last_metric_flush_str:
                            last_metric_flush = int(last_metric_flush_str)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to convert last_metric_flush_str value to an int from flux.aggregate_metrics.last_flush Redis hash for %s'
                            % metric)
                    if not last_metric_flush:
                        # Handle new metrics without throwing an error if they do
                        # not have an entry in the hash
                        logger.info(
                            'aggregator :: probable new metric - no last_metric_flush found in flux.aggregate_metrics.last_flush Redis hash for %s using last_flush'
                            % metric)
                        last_metric_flush = last_flush
                    metric_aggregation_settings = {}
                    try:
                        metric_aggregation_settings_str = self.redis_conn_decoded.hget(
                            'metrics_manager.flux.aggregate_namespaces.settings',
                            metric)
                        # @modified 20210718
                        if metric_aggregation_settings_str:
                            metric_aggregation_settings = literal_eval(
                                metric_aggregation_settings_str)
                        else:
                            metric_aggregation_settings = {}
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to determine aggregation_settings from metrics_manager.flux.aggregate_namespaces.settings Redis hash for %s'
                            % metric)

                    # @added 20210718
                    # Handle newly added metrics that have not been added to
                    # metrics_manager.flux.aggregate_namespaces.settings due to
                    # the chicken-or-egg problem
                    if not metric_aggregation_settings:
                        logger.info(
                            'aggregator :: probable new metric - %s not found in metrics_manager.flux.aggregate_namespaces.settings Redis hash'
                            % metric)
                        aggregate_namespaces = list(
                            settings.FLUX_AGGREGATE_NAMESPACES.keys())
                        pattern_match, metric_matched_by = matched_or_regexed_in_list(
                            'flux', metric, aggregate_namespaces)
                        if pattern_match:
                            matched_namespace = metric_matched_by[
                                'matched_namespace']
                            metric_aggregation_settings = settings.FLUX_AGGREGATE_NAMESPACES[
                                matched_namespace]
                            logger.info(
                                'aggregator :: new metric - %s determined metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES - %s'
                                % (metric, str(metric_aggregation_settings)))
                        else:
                            logger.error(
                                'error :: aggregator :: new metric - %s could not determine metric_aggregation_settings from FLUX_AGGREGATE_NAMESPACES'
                                % (metric))

                    interval = 60
                    try:
                        interval = int(metric_aggregation_settings['interval'])
                    except:
                        # logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to get interval from metric_aggregation_settings for %s, setting to default 60'
                            % metric)
                        interval = 60
                    if (time_now - last_metric_flush) < interval:
                        continue
                    metric_values = []
                    for flux_aggregator_queue_item in flux_aggregator_queue_items:
                        if flux_aggregator_queue_item[0][0] != metric:
                            continue
                        # Discard any values older than the last metric flush
                        if int(flux_aggregator_queue_item[0]
                               [2]) > last_metric_flush:
                            metric_values.append(
                                flux_aggregator_queue_item[0][1])
                        try:
                            self.redis_conn.srem('flux.aggregator.queue',
                                                 flux_aggregator_queue_item[1])
                            remove_from_flux_queue_redis_set.append(
                                flux_aggregator_queue_item[1])
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to remove item from flux.aggregator.queue Redis set - %s'
                                % str(flux_aggregator_queue_item[1]))
                    if not metric_aggregation_settings:
                        logger.error(
                            'error :: no aggregation settings known for %s, discarding data'
                            % metric)
                        continue
                    if metric_values:
                        methods = []
                        try:
                            methods = metric_aggregation_settings['method']
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to determine aggregation methods from metric_aggregation_settings - %s'
                                % str(metric_aggregation_settings))
                            methods = []
                        for method in methods:
                            try:
                                metric_namespace = metric
                                if metric_aggregation_settings[
                                        'method_suffix']:
                                    metric_namespace = '%s.%s' % (metric,
                                                                  method)
                                else:
                                    # @added 20220126 - Feature #4400: flux - quota
                                    # If method_suffix is not set but multiple
                                    # methods are being used, method_suffix
                                    # must be applied, otherwise the metric will
                                    # have all the method values submitted to a
                                    # single metric name.
                                    if len(methods) > 1:
                                        metric_namespace = '%s.%s' % (metric,
                                                                      method)
                                aggregate_value = None
                                if method == 'avg':
                                    if len(metric_values) > 1:
                                        aggregate_value = sum(
                                            metric_values) / len(metric_values)
                                    else:
                                        aggregate_value = metric_values[0]
                                if method == 'sum':
                                    aggregate_value = sum(metric_values)
                                if method == 'max':
                                    aggregate_value = max(metric_values)
                                if method == 'min':
                                    aggregate_value = min(metric_values)
                                if aggregate_value is not None:
                                    try:
                                        backfill = False
                                        metric_data = [
                                            metric_namespace, aggregate_value,
                                            (time_now - interval), backfill
                                        ]
                                        flux.httpMetricDataQueue.put(
                                            metric_data, block=False)
                                        logger.info('aggregator :: added %s' %
                                                    (str(metric_data)))
                                        try:
                                            self.redis_conn.hset(
                                                'flux.aggregate_metrics.last_flush',
                                                metric, time_now)
                                        except:
                                            logger.error(
                                                traceback.format_exc())
                                            logger.error(
                                                'error :: aggregator :: failed to set last metric flush time in Redis hash flux.aggregate_metrics.last_flush'
                                            )
                                    except:
                                        logger.error(traceback.format_exc())
                                        logger.error(
                                            'error :: aggregator :: failed to add aggregator data to flux.httpMetricDataQueue - %s'
                                            % str(metric_data))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: aggregator :: failed to aggregate metric_values by a method for %s'
                                    % str(metric))

                last_flush = time_now

                # flux_zero_fill_metrics = list(self.redis_conn_decoded.smembers('flux.zero_fill_metrics'))

                if FLUX_PERSIST_QUEUE:
                    redis_set_size = 0
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: aggregator :: failed to determine size of flux.queue Redis set'
                        )
                    logger.info(
                        'aggregator :: flux.queue Redis set size of %s before removal of %s items'
                        % (str(redis_set_size),
                           str(len(remove_from_flux_queue_redis_set))))
                    if remove_from_flux_queue_redis_set:
                        try:
                            self.redis_conn.srem(
                                'flux.queue',
                                *set(remove_from_flux_queue_redis_set))
                            remove_from_flux_queue_redis_set = []
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: aggregator :: failed to remove multiple items from flux.queue Redis set'
                            )
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: aggregator :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'aggregator :: flux.queue Redis set size of %s after the removal of items'
                            % (str(redis_set_size)))
                        remove_from_flux_queue_redis_set = []

                if primary_aggregator:
                    try:
                        self.redis_conn.setex(primary_aggregator_key, 300,
                                              aggregator_pid)
                        primary_aggregator_pid = int(
                            self.redis_conn_decoded.get(
                                primary_aggregator_key))
                        logger.info(
                            'aggregator :: set self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                        logger.info(
                            'aggregator :: set Redis primary_aggregator_key key to self pid to primary_aggregator - %s'
                            % str(primary_aggregator_pid))
                    except Exception as e:
                        logger.error(
                            'error :: aggregator :: failed to set Redis primary_aggregator_key key to self pid - %s'
                            % (str(e)))

            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'aggregator :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info(
                    'aggregator :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error('error :: aggregator :: %s' % (str(e)))
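
The run() method above elects a primary aggregator by racing to set a Redis key with a 300 second TTL. A condensed, hedged sketch of that election follows, factored into a standalone function for illustration; the helper name elect_primary is hypothetical, while the key name and TTL mirror the code above.

import random
from os import getpid
from time import sleep


def elect_primary(redis_conn, redis_conn_decoded, main_process_pid):
    # Hypothetical helper illustrating the election pattern in run() above
    primary_aggregator_key = 'flux.primary_aggregator_pid.%s' % str(main_process_pid)
    # Random sleep so concurrent aggregators do not all race on the key at once
    sleep(random.uniform(0.1, 1.5))
    primary_aggregator_pid = 0
    try:
        primary_aggregator_pid = int(redis_conn_decoded.get(primary_aggregator_key))
    except Exception:
        primary_aggregator_pid = 0
    if not primary_aggregator_pid:
        # No primary known, claim the key for 300 seconds
        redis_conn.setex(primary_aggregator_key, 300, getpid())
        primary_aggregator_pid = int(redis_conn_decoded.get(primary_aggregator_key))
    # The aggregator whose pid is stored in the key acts as the primary
    return primary_aggregator_pid == getpid()
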
Example 14
    def run(self):
        """
        Called when process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started roomba' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    '%s :: roomba can\'t connect to redis at socket path %s' %
                    (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                if settings.REDIS_PASSWORD:
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                else:
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                   Branch #3262: py3
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ROOMBA_PROCESSES + 1):
                if not self.skip_mini:
                    logger.info(
                        '%s :: starting vacuum process on mini namespace' %
                        skyline_app)
                    p = Process(target=self.vacuum,
                                args=(i, settings.MINI_NAMESPACE,
                                      settings.MINI_DURATION +
                                      settings.ROOMBA_GRACE_TIME))
                    pids.append(p)
                    p.start()

                logger.info('%s :: starting vacuum process' % skyline_app)
                p = Process(
                    target=self.vacuum,
                    args=(i, settings.FULL_NAMESPACE,
                          settings.FULL_DURATION + settings.ROOMBA_GRACE_TIME))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # deroomba - kill any lingering vacuum processes
            # Changed to manage Roomba processes as edge cases related to I/O
            # wait have been experienced that resulted in Roomba stalling so a
            # ROOMBA_TIMEOUT setting was added and here we use the pattern
            # described by http://stackoverflow.com/users/2073595/dano at
            # http://stackoverflow.com/a/26064238 to monitor and kill any
            # stalled processes rather than using p.join(TIMEOUT) - 20160505
            # @earthgecko ref 1342
            logger.info('%s :: allowing vacuum process/es %s seconds to run' %
                        (skyline_app, str(settings.ROOMBA_TIMEOUT)))
            start = time()
            while time() - start <= settings.ROOMBA_TIMEOUT:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - start
                    logger.info('%s :: vacuum processes completed in %.2f' %
                                (skyline_app, time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all Roomba processes' %
                            (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # sleeping in the main process is more CPU efficient than sleeping
            # in the vacuum def. Also, roomba is quite CPU intensive, so we only
            # want to run roomba once every minute
            process_runtime = time() - now
            roomba_optimum_run_duration = 60
            if process_runtime < roomba_optimum_run_duration:
                sleep_for = (roomba_optimum_run_duration - process_runtime)
                logger.info('%s :: sleeping for %.2f due to low run time' %
                            (skyline_app, sleep_for))
                sleep(sleep_for)
Example 15
# Consolidate flux logging
# logger = set_up_logging('listen')
logger = set_up_logging(None)

LOCAL_DEBUG = False

ALLOWED_CHARS = ['+', '-', '%', '.', '_', '/', '=']
for char in string.ascii_lowercase:
    ALLOWED_CHARS.append(char)
for char in string.ascii_uppercase:
    ALLOWED_CHARS.append(char)
for char in string.digits:
    ALLOWED_CHARS.append(char)

# @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
redis_conn = get_redis_conn('flux')
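
As a hedged illustration of how the ALLOWED_CHARS list built above might be applied (the function below is hypothetical; the real validation in the flux listener is more involved), a submitted metric name can be checked character by character:

def metric_name_is_valid(metric):
    # Illustrative only: reject any metric name containing a character
    # that is not in ALLOWED_CHARS
    for char in str(metric):
        if char not in ALLOWED_CHARS:
            return False
    return True
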


def validate_key(caller, apikey):

    # @added 20200818 - Feature #3694: flux - POST multiple metrics
    # Added metric_namespace_prefix which is declared via the FLUX_API_KEYS
    metric_namespace_prefix = None

    try:
        isAlNum = False
        isAlNum = apikey.isalnum()
        if isAlNum:
            keyLength = len(apikey)
            if keyLength == 32:
                # Check to determine if it is a valid API key
Example 16
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up

        - Spawn a process_metric process to do analysis

        - Wait for the process to finish.

        - run_every 300 seconds
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('related_metrics :: starting')

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: related_metrics cannot connect to redis at socket path %s - %s'
                    % (settings.REDIS_SOCKET_PATH, e))
                sleep(10)
                try:
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)
                except Exception as e:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: related_metrics cannot connect to get_redis_conn - %s'
                        % e)
                continue

            # Report app up
            try:
                self.redis_conn.setex('luminosity.related_metrics', 120, now)
                logger.info(
                    'related_metrics :: set luminosity.related_metrics Redis key'
                )
            except Exception as err:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: related_metrics :: could not update the Redis luminosity.related_metrics key - %s'
                    % str(err))

            now_timestamp = int(time())

            # Spawn process
            pids = []
            spawned_pids = []
            pid_count = 0
            for i in range(1, 1 + 1):
                try:
                    p = Process(target=self.find_related, args=(i, ))
                    pids.append(p)
                    pid_count += 1
                    logger.info(
                        'related_metrics starting %s of 1 find_related processes'
                        % (str(pid_count)))
                    p.start()
                    spawned_pids.append(p.pid)
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: related_metrics :: failed to spawn find_related_metrics process - %s'
                        % e)

            # Self monitor processes and terminate if any find_related
            # has run for longer than run_every - 10
            p_starts = time()
            while time() - p_starts <= (120 - 10):
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        'related_metrics :: find_related process completed in %.2f seconds'
                        % (time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info(
                    'related_metrics :: timed out, killing find_related process'
                )
                for p in pids:
                    logger.info(
                        'related_metrics :: killing find_related process')
                    p.terminate()
                    logger.info(
                        'related_metrics :: killed find_related process')

            for p in pids:
                if p.is_alive():
                    try:
                        logger.info(
                            'related_metrics :: stopping find_related - %s' %
                            (str(p.is_alive())))
                        p.terminate()
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: related_metrics :: failed to stop find_related - %s'
                            % e)

            run_every = 60
            process_runtime = time() - now
            if process_runtime < run_every:
                sleep_for = (run_every - process_runtime)

                process_runtime_now = time() - now
                sleep_for = (run_every - process_runtime_now)

                logger.info(
                    'related_metrics :: sleeping for %.2f seconds due to low run time...'
                    % sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except Exception as e:
                    logger.error(
                        'error :: related_metrics :: failed to del sleep_for - %s'
                        % e)
            try:
                del process_runtime
            except Exception as e:
                logger.error(
                    'error :: related_metrics :: failed to del process_runtime - %s'
                    % e)
Example 17
def get_redis_metrics_timeseries(current_skyline_app, metrics, log=False):
    """
    Return a dict of metrics timeseries as lists e.g.
    {
        'base_name.1': [[ts, value], [ts, value], ..., [ts, value]],
        'base_name.2': [[ts, value], [ts, value], ..., [ts, value]]
    }

    :param current_skyline_app: the app calling the function
    :param metrics: a list of base_names or full Redis metric names
    :param log: whether to log or not, optional, defaults to False
    :type current_skyline_app: str
    :type metrics: list
    :type log: boolean
    :return: metrics_timeseries
    :rtype: dict

    """

    function_str = 'functions.redis.get_metrics_timeseries'
    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    metrics_timeseries = {}
    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn failed - %s' %
            (current_skyline_app, function_str, str(err)))

    try:
        redis_conn_decoded = get_redis_conn_decoded(current_skyline_app)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(
            'error :: %s :: %s :: get_redis_conn_decoded failed - %s' %
            (current_skyline_app, function_str, str(err)))

    assigned_metrics = []
    base_names = []
    for metric in metrics:
        if metric.startswith(FULL_NAMESPACE):
            metric_name = str(metric)
            base_name = metric.replace(FULL_NAMESPACE, '')
        else:
            metric_name = '%s%s' % (FULL_NAMESPACE, str(metric))
            base_name = str(metric)
        assigned_metrics.append(metric_name)
        base_names.append(base_name)
        metrics_timeseries[base_name] = {}

    derivative_metrics = []
    try:
        # @modified 20211012 - Feature #4280: aet.metrics_manager.derivative_metrics Redis hash
        # derivative_metrics = list(redis_conn_decoded.smembers('derivative_metrics'))
        derivative_metrics = list(
            redis_conn_decoded.smembers(
                'aet.metrics_manager.derivative_metrics'))
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: failed to get derivative_metrics from Redis - %s'
            % (current_skyline_app, function_str, str(err)))

    raw_assigned = {}
    try:
        raw_assigned = redis_conn.mget(assigned_metrics)
    except Exception as err:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error(
            'error :: %s :: %s :: failed to get raw_assigned from Redis - %s' %
            (current_skyline_app, function_str, str(err)))

    if raw_assigned:
        for index, metric_name in enumerate(assigned_metrics):
            timeseries = []
            try:
                raw_series = raw_assigned[index]
                if raw_series:
                    unpacker = Unpacker(use_list=False)
                    unpacker.feed(raw_series)
                    timeseries = list(unpacker)
            except Exception as err:
                if not log:
                    current_skyline_app_logger = current_skyline_app + 'Log'
                    current_logger = logging.getLogger(
                        current_skyline_app_logger)
                current_logger.error(
                    'error :: %s :: %s :: failed to unpack %s timeseries - %s'
                    %
                    (current_skyline_app, function_str, metric_name, str(err)))
                timeseries = []
            if timeseries:
                # Convert Redis ts floats to ints
                timeseries = [[int(ts), value] for ts, value in timeseries]
            if timeseries:
                # To ensure that there are no unordered timestamps in the time
                # series which are artefacts of the collector or carbon-relay, sort
                # all time series by timestamp before analysis.
                original_timeseries = timeseries
                if original_timeseries:
                    timeseries = sort_timeseries(original_timeseries)
                    del original_timeseries
                if metric_name in derivative_metrics:
                    if len(timeseries) > 3:
                        try:
                            derivative_timeseries = nonNegativeDerivative(
                                timeseries)
                            timeseries = derivative_timeseries
                        except Exception as err:
                            if not log:
                                current_skyline_app_logger = current_skyline_app + 'Log'
                                current_logger = logging.getLogger(
                                    current_skyline_app_logger)
                            current_logger.error(traceback.format_exc())
                            current_logger.error(
                                'error :: %s :: %s :: nonNegativeDerivative failed on timeseries for %s - %s'
                                % (current_skyline_app, function_str,
                                   metric_name, str(err)))
            if timeseries:
                base_name = base_names[index]
                metrics_timeseries[base_name] = timeseries

    return metrics_timeseries
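
A hedged usage sketch follows; the app name and metric base names passed in are illustrative.

metrics = ['server-1.cpu.user', 'server-2.cpu.user']
metrics_timeseries = get_redis_metrics_timeseries('luminosity', metrics, log=False)
for base_name in metrics:
    timeseries = metrics_timeseries.get(base_name, [])
    print('%s: %s datapoints' % (base_name, str(len(timeseries))))
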
Example 18
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        # In Vista the log management is handled by the fetcher, the worker just
        # waits for the fetcher to do the log management
        now = int(time())
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = int(time())
            else:
                now = log_wait_for + 1

        logger.info('worker :: starting log management')
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: worker :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('worker :: log lock file removed')
            except OSError:
                logger.error(
                    'error :: worker :: failed to remove %s, continuing' %
                    skyline_app_loglock)
                pass
        else:
            logger.info('worker :: bin/%s.d log management done' % skyline_app)

        logger.info('worker :: starting worker')

        try:
            VISTA_ENABLED = settings.VISTA_ENABLED
            logger.info('worker :: VISTA_ENABLED is set to %s' %
                        str(VISTA_ENABLED))
        except:
            VISTA_ENABLED = False
            logger.info(
                'worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False'
            )

        last_sent_to_graphite = int(time())
        metrics_sent_to_flux = 0

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                    if LOCAL_DEBUG:
                        logger.info('worker :: redis is up')
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metrics_data = []
            redis_set = 'vista.fetcher.metrics.json'
            try:
                # Get a metric to validate from the Redis set

                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # metrics_data = self.redis_conn.smembers(redis_set)
                metrics_data = self.redis_conn_decoded.smembers(redis_set)

                if LOCAL_DEBUG:
                    logger.info('worker :: got redis set data - %s' %
                                redis_set)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: retrieving Redis set %s data' %
                    str(redis_set))

            if not metrics_data:
                if LOCAL_DEBUG:
                    logger.info('worker :: no data from Redis set %s' %
                                str(redis_set))
                sleep(5)

            for str_metric_data in metrics_data:
                delete_set_record = False
                remote_host_type = None
                # Initialise timeseries here so the later checks do not raise
                # a NameError if the set record is marked for deletion before
                # the timeseries is built
                timeseries = []
                try:

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Rather using get_redis_conn_decoded
                    # if python_version == 3:
                    #     str_metric_data = str_metric_data.decode('UTF-8')

                    metric_data = literal_eval(str_metric_data)
                    remote_host_type = str(metric_data[0]['remote_host_type'])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for remote_host_type %s'
                            % str(remote_host_type))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine remote_host_type from %s'
                        % str(str_metric_data))
                    delete_set_record = True
                if not delete_set_record:
                    try:
                        remote_target = str(metric_data[0]['remote_target'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for target %s'
                                % str(remote_target))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine target from %s'
                            % str(str_metric_data))
                        delete_set_record = True
                metric = None
                if not delete_set_record:
                    try:
                        metric = str(metric_data[0]['metric'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for metric %s'
                                % str(metric))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine metric from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                namespace_prefix = ''
                if not delete_set_record:
                    try:
                        namespace_prefix = str(
                            metric_data[0]['namespace_prefix'])
                        # Only append the trailing dot when a namespace_prefix
                        # is actually set, otherwise leave it empty
                        if namespace_prefix in ('', 'None'):
                            namespace_prefix = ''
                        else:
                            namespace_prefix = '%s.' % namespace_prefix
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for namespace_prefix %s'
                                % str(namespace_prefix))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine namespace_prefix from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                have_data = False
                if not delete_set_record:
                    last_flux_metric_data = None
                    cache_key = 'flux.last.%s' % (metric)
                    try:
                        if python_version == 3:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key).decode('UTF-8')
                        else:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key)
                        last_flux_metric_data = literal_eval(
                            redis_last_flux_metric_data)
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got last_flux_metric_data from Redis'
                            )
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: retrieving Redis key %s data' %
                            str(cache_key))
                        last_flux_metric_data = False

                    last_flux_timestamp = None
                    if last_flux_metric_data:
                        try:
                            last_flux_timestamp = int(last_flux_metric_data[0])
                            if LOCAL_DEBUG:
                                logger.info(
                                    'worker :: got last_flux_timestamp - %s' %
                                    str(last_flux_timestamp))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed determining last_flux_timestamp'
                            )
                            last_flux_timestamp = False

                    # Determine the timestamp of the current minute to apply
                    # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                    time_now = int(time())
                    # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M')
                    current_minute_hour = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%H'))
                    current_minute_minute = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%M'))
                    current_datetime = datetime.datetime.utcfromtimestamp(
                        time_now).replace(hour=current_minute_hour,
                                          minute=current_minute_minute,
                                          second=0,
                                          microsecond=0)
                    current_minute_timestamp_start = int(
                        current_datetime.strftime('%s'))

                    datapoint = None
                    last_timestamp_with_data = None
                    timeseries = []

                    # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                    metric_resolution = 60
                    metric_resolution_determined = False

                    try:
                        # The datapoints are passed as a string representation
                        # of a list so evaluate it twice to get the list (the
                        # python 2 and python 3 paths were identical here)
                        datapoints_str = literal_eval(
                            metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                        # for value, timestamp in metric_data[0]['datapoints']:
                        if LOCAL_DEBUG:
                            len_metric_datapoints = len(metric_datapoints)
                            logger.info(
                                'worker :: got %s metric_datapoints - %s' %
                                (str(len_metric_datapoints),
                                 str(metric_datapoints)))

                        # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                        # Determine resolution
                        resolution_timestamps = []
                        for metric_datapoint in metric_datapoints:
                            timestamp = int(metric_datapoint[0])
                            resolution_timestamps.append(timestamp)
                        timestamp_resolutions = []
                        if resolution_timestamps:
                            last_timestamp = None
                            for timestamp in resolution_timestamps:
                                if last_timestamp:
                                    resolution = timestamp - last_timestamp
                                    timestamp_resolutions.append(resolution)
                                    last_timestamp = timestamp
                                else:
                                    last_timestamp = timestamp
                        if timestamp_resolutions:
                            try:
                                timestamp_resolutions_count = Counter(
                                    timestamp_resolutions)
                                ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                                )
                                metric_resolution = int(
                                    ordered_timestamp_resolutions_count[0][0])
                                if metric_resolution > 0:
                                    metric_resolution_determined = True
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine metric_resolution from %s'
                                    % (str(metric_data)))
                        if metric_resolution_determined:
                            cache_key = 'vista.last.resolution.%s' % metric
                            try:
                                # Update Redis key
                                self.redis_conn.setex(cache_key, 3600,
                                                      metric_resolution)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to set Redis key - %s'
                                    % (cache_key))

                        for metric_datapoint in metric_datapoints:
                            # @20191010 - Branch #3140: vista
                            # fetcher passes through preformatted data points that
                            # are in the same format/order for both graphite and
                            # prometheus
                            # if remote_host_type == 'graphite':
                            #     value = float(metric_datapoint[0])
                            #     timestamp = int(metric_datapoint[1])
                            # if remote_host_type == 'prometheus':
                            #     value = float(metric_datapoint[1])
                            #     timestamp = int(metric_datapoint[0])
                            timestamp = int(metric_datapoint[0])
                            value = float(metric_datapoint[1])

                            append_to_timeseries = False
                            if last_flux_timestamp:
                                if int(timestamp) > last_flux_timestamp:
                                    # timeseries.append([timestamp, value])
                                    append_to_timeseries = True
                            else:
                                # timeseries.append([timestamp, value])
                                append_to_timeseries = True

                            # Here if the timestamp of the data point falls
                            # within the current minute, it is discarded and not
                            # sent to flux, to ensure that high frequency metrics
                            # can have their minutely bins fully populated before
                            # they are submitted to Graphite
                            if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                                if int(timestamp
                                       ) >= current_minute_timestamp_start:
                                    append_to_timeseries = False
                            if append_to_timeseries:
                                timeseries.append([timestamp, value])

                        last_timestamp_with_data = 0
                        for timestamp, value in timeseries[::-1]:
                            has_value = False
                            if value == 0.0:
                                has_value = True
                            if value:
                                has_value = True
                            if has_value:
                                last_timestamp_with_data = int(timestamp)
                                datapoint = value
                                break
                        if last_timestamp_with_data:
                            have_data = True
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine datapoints from %s'
                            % (str(metric_data)))
                        delete_set_record = True
                if not timeseries:
                    logger.info(
                        'worker :: after processing, there were no valid data points in %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if not have_data and timeseries:
                    logger.error(
                        'error :: worker :: failed to determine last_timestamp_with_data from %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if delete_set_record:
                    try:
                        redis_set = 'vista.fetcher.metrics.json'
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (str(redis_set), str(str_metric_data)))
                    continue

                if not metric:
                    continue

                valid_data = True
                if last_flux_timestamp and last_timestamp_with_data:
                    if int(last_timestamp_with_data) <= last_flux_timestamp:
                        valid_data = False
                if not valid_data:
                    redis_set = 'vista.fetcher.metrics.json'
                    logger.info(
                        'worker :: no valid data in fetched data, removing from Redis set %s - data - %s'
                        % (redis_set, str(str_metric_data)))
                    try:
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (redis_set, str(str_metric_data)))
                    continue

                if valid_data:
                    flux_host = 'http://%s:%s' % (settings.FLUX_IP,
                                                  settings.FLUX_PORT)

                    # Resample
                    resample_at = None
                    if resample_at == 'none' or resample_at == '0Min':
                        resample_at = False
                    if resample_at == 'None' or resample_at == '0min':
                        resample_at = False
                    if resample_at is None or resample_at == '0' or resample_at == 0:
                        resample_at = False
                    if resample_at:
                        try:
                            df = pd.DataFrame(timeseries)
                            df.columns = ['timestamp', 'value']
                            df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                             unit='s',
                                                             origin='unix')
                            df = df.set_index('timestamp')
                            resampled_df = df.resample(resample_at).sum()
                            resampled_timeseries = []
                            for index, row in resampled_df.iterrows():
                                timestamp = int(index.strftime('%s'))
                                resampled_timeseries.append(
                                    [timestamp, row[0]])
                            timeseries = resampled_timeseries
                            timeseries_length = len(timeseries)
                            logger.info(
                                'worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                                % (str(resample_at), str(timeseries_length)))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to resample time series at %s for %s with time series %s'
                                % (str(resample_at), str(metric),
                                   str(timeseries)))

                    for timestamp, value in timeseries:
                        # Submit each data point with its own value and
                        # timestamp rather than the single last datapoint value
                        flux_url = '%s/metric_data?metric=%s&value=%s&timestamp=%s&key=%s' % (
                            flux_host, metric, str(value), str(timestamp),
                            settings.FLUX_SELF_API_KEY)
                        success = False
                        response = None
                        try:
                            response = requests.get(flux_url)
                            if response.status_code in (200, 204):
                                success = True
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to request %s' %
                                str(flux_url))
                        if not success and response is not None:
                            logger.error(
                                'error :: worker :: http status code - %s, reason - %s'
                                % (str(response.status_code),
                                   str(response.reason)))

                    if success:
                        metrics_sent_to_flux += 1
                        redis_set = 'vista.fetcher.metrics.json'

                        # @added 20191011 - Task #3258: Reduce vista logging
                        timeseries_length = len(timeseries)

                        # @modified 20191011 - Task #3258: Reduce vista logging
                        # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % (
                        #     redis_set))
                        logger.info(
                            'worker :: %s data points submitted to flux OK for %s'
                            % (str(timeseries_length), metric))
                        try:
                            self.redis_conn.srem(redis_set, str_metric_data)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to delete data from Redis set %s, data - %s'
                                % (redis_set, str(str_metric_data)))

                        redis_set = 'vista.fetcher.unique_metrics'
                        try:
                            self.redis_conn.sadd(redis_set, remote_target)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to add %s to Redis set %s'
                                % (remote_target, redis_set))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics sent_to_flux in last 60 seconds - %s' %
                    str(metrics_sent_to_flux))
                send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(parent_skyline_app, send_metric_name,
                                         str(metrics_sent_to_flux))
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_flux = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (send_metric_name, str(metrics_sent_to_flux)))
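
The worker above determines a metric's resolution by collecting the gaps between consecutive timestamps and taking the most frequent gap with collections.Counter.most_common(). A standalone, illustrative sketch of that technique follows; the function name and the default of 60 seconds are assumptions for the example, not part of the worker.

# Sketch of the resolution determination technique used by the worker above
from collections import Counter


def determine_resolution(timeseries, default_resolution=60):
    # timeseries is assumed to be a list of [timestamp, value] data points
    timestamps = [int(datapoint[0]) for datapoint in timeseries]
    gaps = [
        current - previous
        for previous, current in zip(timestamps, timestamps[1:])
    ]
    if not gaps:
        return default_resolution
    # most_common(1) returns [(gap, count)] for the most frequent gap
    most_common_gap, _ = Counter(gaps).most_common(1)[0]
    return most_common_gap if most_common_gap > 0 else default_resolution


# determine_resolution([[0, 1.0], [60, 1.5], [120, 2.0], [240, 2.5]]) -> 60
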
Example n. 19
0
def run_selected_algorithm(timeseries, metric_name, airgapped_metrics,
                           airgapped_metrics_filled, run_negatives_present,
                           check_for_airgaps_only):
    """
    Filter timeseries and run selected algorithm.
    """

    # @added 20180807 - Feature #2492: alert on stale metrics
    # Determine if a metric has stopped sending data and if so add to the
    # analyzer.alert_on_stale_metrics Redis set
    add_to_alert_on_stale_metrics = False
    if ALERT_ON_STALE_METRICS:
        # @modified 20180816 - Feature #2492: alert on stale metrics
        # Added try and except to prevent some errors that are encountered between
        # 00:14 and 00:17 on some days
        # Traceback (most recent call last):
        # File "/opt/skyline/github/skyline/skyline/analyzer/analyzer.py", line 394, in spin_process
        # anomalous, ensemble, datapoint = run_selected_algorithm(timeseries, metric_name)
        # File "/opt/skyline/github/skyline/skyline/analyzer/algorithms.py", line 530, in run_selected_algorithm
        # if int(time()) - int(timeseries[-1][0]) >= ALERT_ON_STALE_PERIOD:
        # IndexError: list index out of range
        try:
            if int(time()) - int(timeseries[-1][0]) >= ALERT_ON_STALE_PERIOD:
                add_to_alert_on_stale_metrics = True
        except:
            # @modified 20180816 - Feature #2492: alert on stale metrics
            add_to_alert_on_stale_metrics = False
        try:
            if int(time()) - int(timeseries[-1][0]) >= STALE_PERIOD:
                add_to_alert_on_stale_metrics = False
        except:
            add_to_alert_on_stale_metrics = False

        if add_to_alert_on_stale_metrics:
            try:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
                redis_conn.sadd('analyzer.alert_on_stale_metrics', metric_name)
            except:
                pass

    # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Check to see if this is a batch processing metric that has been sent
    # through Analyzer to check for airgaps only and if so do not check the
    # timeseries for exceptions
    check_for_timeseries_exceptions = True
    check_airgap_only = None
    if BATCH_PROCESSING and check_for_airgaps_only:
        check_airgap_only_key = 'analyzer.check_airgap_only.%s' % metric_name
        try:
            if not add_to_alert_on_stale_metrics:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
            check_airgap_only = redis_conn.get(check_airgap_only_key)
        except:
            check_airgap_only = None
        if check_airgap_only:
            check_for_timeseries_exceptions = False

    # @modified 20200505 - Feature #3504: Handle airgaps in batch metrics
    # Wrapped in check_for_timeseries_exceptions as if it is a check_airgap_only
    # metric then the time series should not be checked for exceptions
    if check_for_timeseries_exceptions:
        # Get rid of short series
        if len(timeseries) < MIN_TOLERABLE_LENGTH:
            raise TooShort()

        # Get rid of stale series
        if time() - timeseries[-1][0] > STALE_PERIOD:
            raise Stale()

        # Get rid of boring series
        if len(set(item[1] for item in
                   timeseries[-MAX_TOLERABLE_BOREDOM:])) == BOREDOM_SET_SIZE:
            raise Boring()

    # @added 20200423 - Feature #3508: ionosphere.untrainable_metrics
    # Added run_negatives_present
    negatives_found = False

    # @added 20200117 - Feature #3400: Identify air gaps in the metric data
    # @modified 20200214 - Bug #3448: Repeated airgapped_metrics
    #                      Feature #3400: Identify air gaps in the metric data
    # if IDENTIFY_AIRGAPS:
    if IDENTIFY_AIRGAPS or IDENTIFY_UNORDERED_TIMESERIES:
        # airgaps = identify_airgaps(metric_name, timeseries, airgapped_metrics)
        # if airgaps:
        process_metric = True
        if IDENTIFY_AIRGAPS:
            if CHECK_AIRGAPS:
                process_metric = False

                # @added 20200423 - Feature #3504: Handle airgaps in batch metrics
                #                   Feature #3400: Identify air gaps in the metric data
                # Replaced code block below to determine if a metric is a check
                # with a skyline_functions definition of that block as
                # the check_metric_for_airgaps function
                check_metric_for_airgaps = False
                try:
                    check_metric_for_airgaps = is_check_airgap_metric(
                        metric_name)
                except:
                    check_metric_for_airgaps = False
                    try:
                        logger.error(
                            'failed to determine if %s is an airgap metric: %s'
                            % (str(metric_name), traceback.format_exc()))
                    except:
                        logger.error(
                            'failed to determine if the metric is an airgap metric'
                        )
                if check_metric_for_airgaps:
                    process_metric = True
        else:
            # If IDENTIFY_AIRGAPS is not enabled and
            # IDENTIFY_UNORDERED_TIMESERIES is enabled process the metric
            if IDENTIFY_UNORDERED_TIMESERIES:
                process_metric = True
        airgaps = None
        unordered_timeseries = False
        if process_metric:
            # @modified 20200501 - Feature #3400: Identify air gaps in the metric data
            # Added airgapped_metrics_filled
            # airgaps, unordered_timeseries = identify_airgaps(metric_name, timeseries, airgapped_metrics)
            airgaps, unordered_timeseries = identify_airgaps(
                metric_name, timeseries, airgapped_metrics,
                airgapped_metrics_filled)
        if airgaps or unordered_timeseries:
            try:
                redis_conn.ping()
            except:
                # @added 20200505 - Feature #3504: Handle airgaps in batch metrics
                # Use get_redis_conn
                from skyline_functions import get_redis_conn
                redis_conn = get_redis_conn(skyline_app)
        if airgaps:
            for i in airgaps:
                try:
                    redis_conn.sadd('analyzer.airgapped_metrics', str(i))
                    logger.info('adding airgap %s' % str(i))
                    # TODO: learn_airgapped_metrics
                except:
                    pass
            del airgaps

        # @added 20200214 - Bug #3448: Repeated airgapped_metrics
        #                   Feature #3400: Identify air gaps in the metric data
        # Also add unordered time series to the analyzer.unordered_timeseries
        # Redis set
        if unordered_timeseries:
            try:
                redis_conn.sadd('analyzer.unordered_timeseries', metric_name)
                del unordered_timeseries
            except:
                pass

    # @added 20200423 - Feature #3504: Handle airgaps in batch metrics
    #                   Feature #3480: batch_processing
    #                   Feature #3486: analyzer_batch
    #                   Feature #3400: Identify air gaps in the metric data
    # Check to see if this is a batch processing metric that has been sent to
    # analyzer_batch for processing but sent through Analyzer to check for
    # airgaps only and if so return as it should not be run through algorithms
    if BATCH_PROCESSING:
        if check_airgap_only:
            try:
                redis_conn.delete(check_airgap_only_key)
            except:
                try:
                    logger.error(
                        'failed to delete Redis key %s: %s' %
                        (str(check_airgap_only_key), traceback.format_exc()))
                except:
                    logger.error(
                        'failed to log the failure regarding deleting the check_airgap_only_key Redis key'
                    )
            # @modified 20200430 - Feature #3480: batch_processing
            # Tidy up and reduce logging, only log if debug enabled
            if BATCH_PROCESSING_DEBUG:
                logger.info(
                    'algorithms :: batch processing - batch metric %s checked for airgaps only, not analysing'
                    % (str(metric_name)))

            # TODO: the only worry here is that this metric then gets added to
            # the not_anomalous Redis set?  Not sure if that is a problem, I do
            # not think it is.  Unless it is in the end of anomaly_end_timestamp
            # context?
            # @modified 20200424 - Feature #3508: ionosphere.untrainable_metrics
            # Added negatives_found
            return False, [], 1, negatives_found

    # RUN_OPTIMIZED_WORKFLOW - replaces the original ensemble method:
    # ensemble = [globals()[algorithm](timeseries) for algorithm in ALGORITHMS]
    # which runs all timeseries through all ALGORITHMS
    final_ensemble = []
    number_of_algorithms_triggered = 0
    number_of_algorithms_run = 0
    number_of_algorithms = len(ALGORITHMS)
    maximum_false_count = number_of_algorithms - CONSENSUS + 1
    # logger.info('the maximum_false_count is %s, above which CONSENSUS cannot be achieved' % (str(maximum_false_count)))
    consensus_possible = True
    # DEVELOPMENT: this is for a development version of analyzer only
    if skyline_app == 'analyzer_dev':
        time_all_algorithms = True
    else:
        time_all_algorithms = False

    algorithm_tmp_file_prefix = '%s/%s.' % (SKYLINE_TMP_DIR, skyline_app)

    for algorithm in ALGORITHMS:
        if consensus_possible:

            if send_algorithm_run_metrics:
                algorithm_count_file = '%s%s.count' % (
                    algorithm_tmp_file_prefix, algorithm)
                algorithm_timings_file = '%s%s.timings' % (
                    algorithm_tmp_file_prefix, algorithm)

            run_algorithm = []
            run_algorithm.append(algorithm)
            number_of_algorithms_run += 1
            if send_algorithm_run_metrics:
                start = timer()
            try:
                algorithm_result = [
                    globals()[test_algorithm](timeseries)
                    for test_algorithm in run_algorithm
                ]
            except:
                # logger.error('%s failed' % (algorithm))
                algorithm_result = [None]

            if send_algorithm_run_metrics:
                end = timer()
                with open(algorithm_count_file, 'a') as f:
                    f.write('1\n')
                with open(algorithm_timings_file, 'a') as f:
                    f.write('%.6f\n' % (end - start))
        else:
            algorithm_result = [False]
            # logger.info('CONSENSUS NOT ACHIEVABLE - skipping %s' % (str(algorithm)))

        if algorithm_result.count(True) == 1:
            result = True
            number_of_algorithms_triggered += 1
            # logger.info('algorithm %s triggered' % (str(algorithm)))
        elif algorithm_result.count(False) == 1:
            result = False
        elif algorithm_result.count(None) == 1:
            result = None
        else:
            result = False

        final_ensemble.append(result)

        if not RUN_OPTIMIZED_WORKFLOW:
            continue

        if time_all_algorithms:
            continue

        if ENABLE_ALL_ALGORITHMS_RUN_METRICS:
            continue

        # true_count = final_ensemble.count(True)
        # false_count = final_ensemble.count(False)
        # logger.info('current false_count %s' % (str(false_count)))

        if final_ensemble.count(False) >= maximum_false_count:
            consensus_possible = False
            # logger.info('CONSENSUS cannot be reached as %s algorithms have already not been triggered' % (str(false_count)))
            # skip_algorithms_count = number_of_algorithms - number_of_algorithms_run
            # logger.info('skipping %s algorithms' % (str(skip_algorithms_count)))

    # logger.info('final_ensemble: %s' % (str(final_ensemble)))

    try:
        # ensemble = [globals()[algorithm](timeseries) for algorithm in ALGORITHMS]
        ensemble = final_ensemble

        threshold = len(ensemble) - CONSENSUS
        if ensemble.count(False) <= threshold:

            # @added 20200425 - Feature #3508: ionosphere.untrainable_metrics
            # Only run a negatives_present check if it is anomalous, there
            # is no need to check unless it is related to an anomaly
            if run_negatives_present:
                try:
                    negatives_found = negatives_present(timeseries)
                except:
                    logger.error('Algorithm error: negatives_present :: %s' %
                                 traceback.format_exc())
                    negatives_found = False

            if ENABLE_SECOND_ORDER:
                if is_anomalously_anomalous(metric_name, ensemble,
                                            timeseries[-1][1]):
                    # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics
                    # Added negatives_found
                    return True, ensemble, timeseries[-1][1], negatives_found
            else:
                return True, ensemble, timeseries[-1][1], negatives_found

        # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics
        # Added negatives_found
        return False, ensemble, timeseries[-1][1], negatives_found
    except:
        logger.error('Algorithm error: %s' % traceback.format_exc())
        # @modified 20200423 - Feature #3508: ionosphere.untrainable_metrics
        # Added negatives_found
        return False, [], 1, negatives_found
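
run_selected_algorithm above avoids running every algorithm once CONSENSUS can no longer be reached: when the number of algorithms that returned False reaches len(ALGORITHMS) - CONSENSUS + 1, the remaining algorithms are skipped and recorded as not triggered. A condensed, hypothetical sketch of that early-exit ensemble loop follows; the function and its arguments are illustrative, not Skyline's API.

# Condensed sketch of the CONSENSUS early-exit pattern used above
def run_ensemble(timeseries, algorithms, consensus):
    ensemble = []
    # Once this many algorithms have returned False, CONSENSUS is impossible
    maximum_false_count = len(algorithms) - consensus + 1
    for algorithm in algorithms:
        if ensemble.count(False) >= maximum_false_count:
            # No point running the rest, record them as not triggered
            ensemble.append(False)
            continue
        try:
            ensemble.append(bool(algorithm(timeseries)))
        except Exception:
            ensemble.append(None)
    # Anomalous when no more than len(ensemble) - consensus algorithms said False
    anomalous = ensemble.count(False) <= (len(ensemble) - consensus)
    return anomalous, ensemble


# Example usage with trivial stand-in algorithms (consensus of 2 out of 3)
# anomalous, ensemble = run_ensemble(
#     [[0, 1.0], [60, 99.0]],
#     [lambda ts: ts[-1][1] > 10, lambda ts: ts[-1][1] > 50, lambda ts: False],
#     consensus=2)
# anomalous -> True, ensemble -> [True, True, False]
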
Example n. 20
0
    def run(self):
        """
        Called when the process initializes.
        """
        def pickle_data_to_graphite(data):

            message = None
            try:
                payload = pickle.dumps(data, protocol=2)
                header = struct.pack("!L", len(payload))
                message = header + payload
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle to send to Graphite'
                )
                return False
            if message:
                try:
                    sock = socket.socket()
                    sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT))
                    sock.sendall(message)
                    sock.close()
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to send pickle data to Graphite'
                    )
                    return False
            else:
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle metric data into message'
                )
                return False
            return True
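
        # Note: the carbon pickle protocol used by pickle_data_to_graphite
        # expects data to be a list of (metric_path, (timestamp, value))
        # tuples, for example (illustrative values only):
        # pickle_data_to_graphite([
        #     ('vista.example.metric', (1609459200, 1.0)),
        #     ('vista.example.metric', (1609459260, 2.0)),
        # ])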

        logger.info('populate_metric_worker :: starting worker')

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'populate_metric_worker :: cannot connect to Redis at socket path %s'
                        % (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    # @added 20191128 - Bug #3266: py3 Redis binary objects not strings
                    #                   Branch #3262: py3
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metricDict = None
            try:
                # Get a metric item from the queue with a 1 second timeout.
                # Each item on the queue is a dict of metric data with keys
                # such as remote_target, metric and fetch_resolution_urls
                metricDict = self.q.get(True, 1)
                logger.info('populate_metric_worker :: processing queue item')
            except Empty:
                logger.info(
                    'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds'
                )
                sleep(30)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info(
                    'populate_metric_worker :: server was interrupted - SystemExit'
                )
            except Exception as e:
                logger.error('error :: populate_metric_worker :: %s' %
                             (str(e)))

            if not metricDict:
                continue

            try:
                remote_host_type = str(metricDict['remote_host_type'])
                remote_target = str(metricDict['remote_target'])
                metric = str(metricDict['metric'])
                namespace_prefix = str(metricDict['namespace_prefix'])
                if not namespace_prefix:
                    namespace_prefix = ''
                if namespace_prefix == 'None':
                    namespace_prefix = ''
                key = str(metricDict['key'])
                token = str(metricDict['token'])
                user = str(metricDict['user'])
                password = str(metricDict['password'])
                if metricDict['fetch_resolution_urls'] == 'None':
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue
                if metricDict['fetch_resolution_urls'] == '()' or metricDict[
                        'fetch_resolution_urls'] == ():
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue

                fetch_resolution_urls_str = literal_eval(
                    metricDict['fetch_resolution_urls'])
                fetch_resolution_urls = literal_eval(fetch_resolution_urls_str)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to read from metricData'
                )
                # Skip this queue item rather than continuing with undefined
                # or stale metric variables from a previous iteration
                continue

            if LOCAL_DEBUG:
                try:
                    logger.info(
                        'populate_metric_worker :: remote_target from metricData set to %s'
                        % remote_target)
                    logger.info(
                        'populate_metric_worker :: metric from metricData set to %s'
                        % metric)
                    logger.info(
                        'populate_metric_worker :: namespace_prefix from metricData set to %s'
                        % namespace_prefix)
                    logger.info(
                        'populate_metric_worker :: key from metricData set to %s'
                        % key)
                    logger.info(
                        'populate_metric_worker :: token from metricData set to %s'
                        % token)
                    logger.info(
                        'populate_metric_worker :: user from metricData set to %s'
                        % user)
                    logger.info(
                        'populate_metric_worker :: password from metricData set to %s'
                        % password)
                    logger.info(
                        'populate_metric_worker :: fetch_resolution_urls from metricData set to %s'
                        % str(fetch_resolution_urls))
                    if fetch_resolution_urls:
                        for fetch_url in fetch_resolution_urls:
                            logger.info(
                                'populate_metric_worker :: a fetch_url from metricData is set to %s'
                                % str(fetch_url))
                    logger.info(
                        'populate_metric_worker :: metric is set to %s' %
                        metric)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to read from metricData'
                    )

            # Best effort to de-duplicate the data sent to Graphite
            cache_key = 'flux.last.%s' % metric
            last_flux_timestamp = None
            try:
                # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8')
                redis_last_metric_data = self.redis_conn_decoded.get(cache_key)
                last_metric_data = literal_eval(redis_last_metric_data)
                last_flux_timestamp = int(last_metric_data[0])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s'
                    % cache_key)
                last_flux_timestamp = False
            recent_last_flux_timestamp_present = False
            if last_flux_timestamp:
                now = int(time())
                if (now - last_flux_timestamp) < 600:
                    recent_last_flux_timestamp_present = True
                    # Skyline has the metric so adding it to the vista.fetcher
                    # Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s'
                            % (metric, redis_set))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))
                    # continue

            if not last_flux_timestamp:
                # Check Graphite does not have the data or determine what the
                # last data Graphite has is
                logger.info(
                    'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data'
                    % (metric))
                check_graphite_from = [
                    '-50mins', '-6hours', '-24hours', '-7days', '-30days',
                    '-90days'
                ]
                timeseries = []
                for graphite_from in check_graphite_from:
                    if last_flux_timestamp:
                        break
                    logger.info(
                        'populate_metric_worker :: checking %s in Graphite from %s'
                        % (metric, graphite_from))
                    got_data = False
                    try:
                        # We use absolute time so that if there is a lag in mirage the correct
                        # timeseries data is still surfaced relevant to the anomalous datapoint
                        # timestamp
                        if settings.GRAPHITE_PORT != '':
                            url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                str(settings.GRAPHITE_PORT),
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        else:
                            url = '%s://%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        logger.info(
                            'populate_metric_worker :: using Graphite URL - %s'
                            % (url))
                        r = requests.get(url)
                        if r.status_code == 200:
                            js = []
                            try:
                                js = r.json()
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: populate_metric_worker :: failed to get data from Graphite'
                                )
                                continue
                            if not js:
                                logger.info(
                                    'populate_metric_worker :: %s not present in Graphite from %s'
                                    % (metric, graphite_from))
                                continue
                            got_data = True
                            logger.info(
                                'populate_metric_worker :: %s present in Graphite from %s'
                                % (metric, graphite_from))
                        else:
                            logger.info(
                                'populate_metric_worker :: %s not present in Graphite from %s'
                                % (metric, graphite_from))
                            continue
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to get data from Graphite'
                        )
                        continue

                    datapoints = []
                    if got_data:
                        try:
                            js = r.json()
                            datapoints = js[0]['datapoints']
                            logger.info(
                                'populate_metric_worker :: %s data points are present in the Graphite %s data'
                                % (str(len(datapoints)), str(graphite_from)))
                        except:
                            logger.info(traceback.format_exc())
                            logger.error(
                                'error :: populate_metric_worker :: failed to get data from Graphite'
                            )

                    for datapoint in datapoints:
                        try:
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                        except:  # nosec
                            continue
                    last_timestamp_with_data = None
                    for timestamp, value in timeseries[::-1]:
                        has_value = False
                        if value == 0.0:
                            has_value = True
                        if value == 0:
                            has_value = True
                        if value:
                            has_value = True
                        if has_value:
                            last_timestamp_with_data = int(timestamp)
                            datapoint = value
                            break
                    if last_timestamp_with_data:
                        # Here we set this as the missing last_flux_timestamp
                        last_flux_timestamp = last_timestamp_with_data
                        recent_last_flux_timestamp_present = True
                        logger.info(
                            'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp'
                            % (metric, str(graphite_from),
                               str(last_flux_timestamp)))

            timeseries = []
            start_populating = int(time())
            datapoints_added_to_timeseries = 0
            datapoints_already_populated = 0
            datapoints_with_no_value = 0
            timestamp = None
            value = None

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # And set the flux.last key even if the value returned from the
            # remote is null so that time series that are mostly null do not
            # keep on getting added to flux populate_metric by Vista
            raw_timeseries = []

            for fetch_url in fetch_resolution_urls:
                # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
                # This was for the query query and resample method and not for
                # the query_range query
                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000':
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )

                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )

                success = False
                response = None
                try:
                    logger.info(
                        'populate_metric_worker :: getting data from %s' %
                        str(fetch_url))
                    response = requests.get(fetch_url)
                    if response.status_code == 200:
                        success = True
                except:
                    logger.info(traceback.format_exc())
                    # response may not be set if requests.get itself failed
                    if response is not None:
                        logger.error(
                            'error :: populate_metric_worker :: http status code - %s, reason - %s'
                            % (str(response.status_code), str(response.reason)))
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))

                if not success:
                    continue

                datapoints = None
                try:
                    js = response.json()
                    if remote_host_type == 'graphite':
                        datapoints = js[0]['datapoints']
                    if remote_host_type == 'prometheus':
                        datapoints = js['data']['result'][0]['values']
                    datapoints_fetched = len(datapoints)
                    logger.info(
                        'populate_metric_worker :: retrieved %s data points from %s'
                        % (str(datapoints_fetched), str(fetch_url)))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))

                # Example
                # datapoints[0]
                # [7.3, 1556817000]
                # Add each data point and timestamp to the timeseries list so
                # they can be sent to Graphite
                if not datapoints:
                    logger.info(
                        'populate_metric_worker :: failed to get any data from %s'
                        % str(fetch_url))
                    continue

                # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                valid_datapoints = []
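                # Note the differing element order, a Graphite datapoint is
                # [value, timestamp] whereas a Prometheus value is
                # [timestamp, value]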
                for datapoint in datapoints:
                    value = None
                    timestamp = None
                    if remote_host_type == 'graphite':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[1], datapoint[0]])

                        try:
                            raw_value = datapoint[0]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            valid_datapoints.append([value, timestamp])
                        except:
                            continue
                    if remote_host_type == 'prometheus':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[0], datapoint[1]])

                        try:
                            raw_value = datapoint[1]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            timestamp = int(datapoint[0])
                            value = float(datapoint[1])
                        except:
                            continue
                        valid_datapoints.append([timestamp, value])
                datapoints = valid_datapoints

                # Order the time series by timestamp as the tuple can shift
                # order resulting in more recent data being added before older
                # data
                datapoints.sort()

                # Determine the timestamp of the current minute to apply
                # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                time_now = int(time())
                current_minute_hour = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%H'))
                current_minute_minute = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%M'))
                current_datetime = datetime.datetime.utcfromtimestamp(
                    time_now).replace(hour=current_minute_hour,
                                      minute=current_minute_minute,
                                      second=0,
                                      microsecond=0)
                current_minute_timestamp_start = int(
                    current_datetime.strftime('%s'))
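                # For example, a time_now of 12:34:56 UTC results in a
                # current_minute_timestamp_start equivalent to 12:34:00 UTC,
                # so any data point with a timestamp at or after that falls
                # within the current, still filling, minute bin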
                datapoints_in_current_minute = 0

                last_error = None
                value = None
                timestamp = None
                for datapoint in datapoints:
                    try:
                        if remote_host_type == 'graphite':
                            try:
                                raw_value = datapoint[0]
                                if raw_value is None:
                                    continue
                                value = float(datapoint[0])
                                timestamp = int(datapoint[1])
                            except:
                                continue
                        if remote_host_type == 'prometheus':
                            # timestamp = int(datapoint[0])
                            try:
                                timestamp = int(datapoint[0])
                                value = float(datapoint[1])
                            except:
                                continue
                        submit_data = True
                        if last_flux_timestamp:
                            if timestamp <= last_flux_timestamp:
                                submit_data = False
                                datapoints_already_populated += 1

                        # Here if the timestamp of the data point falls
                        # within the current minute, it is discarded and not
                        # sent to flux, to ensure that high frequency metrics
                        # can have their minutely bins fully populated before
                        # they are submitted to Graphite
                        if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                            if timestamp >= current_minute_timestamp_start:
                                submit_data = False
                                datapoints_in_current_minute += 1
                        if submit_data:
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                            datapoints_added_to_timeseries += 1
                    # nosec to exclude from bandit tests
                    except:  # nosec
                        last_error = traceback.format_exc()
                        datapoints_with_no_value += 1
                        continue

                if last_error:
                    logger.error(last_error)
                    logger.error(
                        'error :: populate_metric_worker :: the above is the last_error encountered processing %s'
                        % (str(metric)))
                if datapoints_with_no_value:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they had value None'
                        % (str(datapoints_with_no_value)))
                if datapoints_in_current_minute:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute'
                        % (str(datapoints_in_current_minute)))
                logger.info(
                    'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp'
                    % (str(datapoints_already_populated)))
                logger.info(
                    'populate_metric_worker :: added %s data points to the time series to submit to Graphite'
                    % (str(datapoints_added_to_timeseries)))

            end_fetching = int(time())
            seconds_to_fetch = end_fetching - start_populating
            if timestamp:
                logger.info(
                    'populate_metric_worker :: last fetched value - %s, timestamp %s'
                    % (str(value), str(timestamp)))
            logger.info(
                'populate_metric_worker :: %s data points fetched for %s in %s seconds'
                % (str(datapoints_added_to_timeseries), remote_target,
                   str(seconds_to_fetch)))

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # And set the flux.last key if the returned value from the remote
            # is null so that time series that are mostly null do not keep on
            # getting added to flux populate_metric by Vista
            if not timeseries:
                set_flux_key = False
                try:
                    sorted_raw_timeseries = sorted(raw_timeseries,
                                                   key=lambda x: x[0])
                    last_ts = sorted_raw_timeseries[-1][0]
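                    # Only set the flux key if the most recent raw data point
                    # is within the last 2 minutes and its value is null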
                    if int(last_ts) > (end_fetching - 120):
                        if sorted_raw_timeseries[-1][1] is None:
                            set_flux_key = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to determine if last value was null'
                    )
                if set_flux_key:
                    try:
                        # Update Redis flux key
                        cache_key = 'flux.last.%s' % metric
                        metric_data = [int(last_ts), None]
                        self.redis_conn.set(cache_key, str(metric_data))
                        logger.info(
                            'populate_metric_worker :: even though there were no data points, set the metric Redis key - %s - %s - so as to not loop round on this metric'
                            % (cache_key, str(metric_data)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s'
                            % (cache_key, str(metric_data)))
                    # Adding to the vista.fetcher.unique_metrics Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: even though no data points, added %s to Redis set %s'
                            % (remote_target, redis_set))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))

            if not timeseries:
                logger.info(
                    'populate_metric_worker :: no data in the timeseries list for the time series for %s'
                    % metric)
                continue

            # Order the time series by timestamp as the tuple can shift
            # order resulting in more recent data being added before older
            # data
            timeseries.sort()
            timeseries_length = len(timeseries)

            # Resample
            resample_at = '1Min'
            if resample_at:
                try:
                    df = pd.DataFrame(timeseries)
                    df.columns = ['timestamp', 'value']
                    df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                     unit='s',
                                                     origin='unix')
                    df = df.set_index('timestamp')
                    # resampled_df = df.resample(resample_at).sum()
                    # Use the mean as Prometheus uses the average in the
                    # query_range API method
                    resampled_df = df.resample(resample_at).mean()
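                    # For example, data points at 12:34:10, 12:34:25 and
                    # 12:34:40 collapse into a single 12:34:00 value, their
                    # mean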
                    resampled_timeseries = []
                    for index, row in resampled_df.iterrows():
                        timestamp = int(index.strftime('%s'))
                        resampled_timeseries.append([timestamp, row[0]])
                    timeseries = resampled_timeseries
                    timeseries_length = len(timeseries)
                    logger.info(
                        'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                        % (str(resample_at), str(timeseries_length)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to resample time series for %s'
                        % str(metric))

            logger.info(
                'populate_metric_worker :: %s data points to send to Graphite'
                % (str(timeseries_length)))
            timestamp = None
            value = None
            sent_to_graphite = 0

            # use_pickle = False
            use_pickle = True
            if not use_pickle:
                for timestamp, value in timeseries:
                    try:
                        graphyte.send(metric, float(value), int(timestamp))
                        sent_to_graphite += 1
                        if sent_to_graphite % 1000 == 0:
                            logger.info(
                                'populate_metric_worker :: submitted %s of %s data points to Graphite so far'
                                % (str(sent_to_graphite),
                                   str(timeseries_length)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                            % str(metric))
            else:
                listOfMetricTuples = []
                try:
                    for timestamp, value in timeseries:
                        tuple_data = (metric, (int(timestamp), float(value)))
                        listOfMetricTuples.append(tuple_data)
                        sent_to_graphite += 1
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s'
                        % str(metric))
                if listOfMetricTuples:
                    data_points_sent = 0
                    smallListOfMetricTuples = []
                    tuples_added = 0
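                    # Send the tuples to Graphite in batches of up to 1000
                    # per pickle payload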
                    for data in listOfMetricTuples:
                        smallListOfMetricTuples.append(data)
                        tuples_added += 1
                        if tuples_added >= 1000:
                            pickle_data_sent = pickle_data_to_graphite(
                                smallListOfMetricTuples)
                            if pickle_data_sent:
                                data_points_sent += tuples_added
                                logger.info(
                                    'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s'
                                    %
                                    (str(tuples_added), str(data_points_sent),
                                     str(timeseries_length), metric))
                                sent_to_graphite += len(
                                    smallListOfMetricTuples)
                                smallListOfMetricTuples = []
                                tuples_added = 0
                            else:
                                logger.error(
                                    'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s'
                                    % (str(tuples_added), metric))
                    if smallListOfMetricTuples:
                        tuples_to_send = len(smallListOfMetricTuples)
                        pickle_data_sent = pickle_data_to_graphite(
                            smallListOfMetricTuples)
                        if pickle_data_sent:
                            data_points_sent += tuples_to_send
                            logger.info(
                                'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), str(data_points_sent),
                                   str(timeseries_length), metric))
                        else:
                            logger.error(
                                'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), metric))

            logger.info(
                'populate_metric_worker :: sent %s data points to Graphite for %s'
                % (str(sent_to_graphite), metric))
            try:
                skyline_metric = '%s.datapoints_sent_to_graphite' % (
                    skyline_app_graphite_namespace)
                # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                # graphyte.send(skyline_metric, float(sent_to_graphite), int(time()))
                send_graphite_metric(skyline_app, skyline_metric,
                                     float(sent_to_graphite))
                logger.info(
                    'populate_metric_worker :: submitted %s to Graphite for %s'
                    % (str(float(sent_to_graphite)), skyline_metric))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                    % str(skyline_metric))

            has_value = False
            if value == 0.0:
                has_value = True
            if value == 0:
                has_value = True
            if value:
                has_value = True

            if timestamp and has_value:
                try:
                    # Update Redis flux key
                    cache_key = 'flux.last.%s' % metric
                    metric_data = [int(timestamp), float(value)]
                    self.redis_conn.set(cache_key, str(metric_data))
                    logger.info(
                        'populate_metric_worker :: set the metric Redis key - %s - %s'
                        % (cache_key, str(metric_data)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to set Redis key - %s - %s'
                        % (cache_key, str(metric_data)))

                # Adding to the vista.fetcher.unique_metrics Redis set
                redis_set = 'vista.fetcher.unique_metrics'
                data = str(remote_target)
                try:
                    self.redis_conn.sadd(redis_set, data)
                    logger.info(
                        'populate_metric_worker :: added %s to Redis set %s' %
                        (remote_target, redis_set))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                        % (str(data), str(redis_set)))

            end_populating = int(time())
            seconds_to_run = end_populating - start_populating
            logger.info(
                'populate_metric_worker :: %s populated to Graphite in %s seconds'
                % (metric, str(seconds_to_run)))
Esempio n. 21
0
# @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
# @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
#                      Branch #3262: py3
# Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
# types need to be decoded as utf-8 to str
# if settings.REDIS_PASSWORD:
#     redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
# else:
#     redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)

# @added 20191030 - Bug #3266: py3 Redis binary objects not strings
#                   Branch #3262: py3
# Added single functions to deal with the Redis connection and the
# charset='utf-8', decode_responses=True arguments required in py3
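# A minimal illustrative sketch of what these helpers are assumed to do (the
# actual implementations live in the Skyline codebase), the decoded variant
# simply adding the charset and decode_responses arguments:
#
# def get_redis_conn(current_skyline_app):
#     if settings.REDIS_PASSWORD:
#         return StrictRedis(password=settings.REDIS_PASSWORD,
#                            unix_socket_path=settings.REDIS_SOCKET_PATH)
#     return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
#
# def get_redis_conn_decoded(current_skyline_app):
#     if settings.REDIS_PASSWORD:
#         return StrictRedis(password=settings.REDIS_PASSWORD,
#                            unix_socket_path=settings.REDIS_SOCKET_PATH,
#                            charset='utf-8', decode_responses=True)
#     return StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH,
#                        charset='utf-8', decode_responses=True)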
redis_conn = get_redis_conn(skyline_app)
redis_conn_decoded = get_redis_conn_decoded(skyline_app)


def get_anomaly(request_type):
    """
    Query the database for the anomaly details
    """

    logger = logging.getLogger(skyline_app_logger)

    if isinstance(request_type, int):
        latest = False
    else:
        latest = True
Esempio n. 22
0
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up

        - Spawn a rolling process to do checks

        - Wait for the process to finish.

        - run_every 60 seconds
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('thunder/rolling :: starting %s/rolling' % skyline_app)

        try:
            SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME
            if SERVER_METRIC_PATH == '.':
                SERVER_METRIC_PATH = ''
        except Exception as e:
            SERVER_METRIC_PATH = ''
            logger.warning(
                'warning :: thunder/rolling :: settings.SERVER_METRICS_NAME is not declared in settings.py, defaults to \'\' - %s'
                % e)

        run_every = 60

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling cannot connect to redis at socket path %s - %s'
                    % (settings.REDIS_SOCKET_PATH, e))
                sleep(10)
                try:
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)
                except Exception as e:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: thunder/rolling cannot connect to get_redis_conn - %s'
                        % e)
                continue

            # Report app up
            try:
                self.redis_conn.setex('thunder.rolling', 120, now)
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling :: could not update the Redis thunder.rolling key - %s'
                    % e)

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            try:
                p = Process(target=self.rolling_process, args=(0, ))
                pids.append(p)
                pid_count += 1
                logger.info('thunder/rolling :: starting rolling_process')
                p.start()
                spawned_pids.append(p.pid)
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: thunder/rolling :: failed to spawn process - %s'
                    % e)

            # Self monitor processes and terminate any rolling_process that
            # has run for longer than run_every (60) seconds
            p_starts = time()
            while time() - p_starts <= run_every:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        'thunder/rolling :: rolling_process completed in %.2f seconds'
                        % (time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info(
                    'thunder/rolling :: timed out, killing rolling_process process'
                )
                for p in pids:
                    logger.info(
                        'thunder/rolling :: killing rolling_process process')
                    p.terminate()
                    logger.info(
                        'thunder/rolling :: killed rolling_process process')

            for p in pids:
                if p.is_alive():
                    try:
                        logger.info(
                            'thunder/rolling :: stopping rolling_process - %s'
                            % (str(p.is_alive())))
                        p.terminate()
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: thunder/rolling :: failed to stop rolling_process - %s'
                            % e)

            process_runtime = time() - now
            if process_runtime < run_every:
                process_runtime_now = time() - now
                sleep_for = (run_every - process_runtime_now)

                logger.info(
                    'thunder/rolling :: sleeping for %.2f seconds due to low run time...'
                    % sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except Exception as e:
                    logger.error(
                        'error :: thunder/rolling :: failed to del sleep_for - %s'
                        % e)
            try:
                del process_runtime
            except Exception as e:
                logger.error(
                    'error :: thunder/rolling :: failed to del process_runtime - %s'
                    % e)
Esempio n. 23
0
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up and discover checks to run.

        - Divide and assign each process a metric check to analyse and add
          results to source Redis set.

        - Wait for the processes to finish.

        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('starting SNAB_flux_load_test')

        while 1:
            now = time()
            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                logger.info('pinged Redis via get_redis_conn')
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: cannot connect to redis at socket path %s' %
                    settings.REDIS_SOCKET_PATH)
                sleep(10)
                try:
                    self.redis_conn = get_redis_conn(skyline_app)
                    logger.info('connected via get_redis_conn')
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: not connected via get_redis_conn')
                continue
            try:
                self.redis_conn_decoded.ping()
                logger.info('pinged Redis via get_redis_conn_decoded')
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: not connected via get_redis_conn_decoded')
                sleep(10)
                try:
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)
                    logger.info('connected via get_redis_conn_decoded')
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: cannot connect to get_redis_conn_decoded')
                continue
            """
            Run load test
            """
            while True:

                current_timestamp = int(time())

                logger.info('snab_flux_load_test - running load test')

                # Spawn processes
                pids = []
                spawned_pids = []
                pid_count = 0
                p = Process(target=self.spin_snab_flux_load_test_process,
                            args=(current_timestamp, ))
                pids.append(p)
                pid_count += 1
                logger.info('starting 1 of %s spin_snab_process' %
                            (str(pid_count)))
                p.start()
                spawned_pids.append(p.pid)

                # Send wait signal to zombie processes
                # for p in pids:
                #     p.join()
                # Self monitor processes and terminate if any spin_snab_process
                # that has run for longer than 58 seconds
                p_starts = time()
                while time() - p_starts <= 58:
                    if any(p.is_alive() for p in pids):
                        # Just to avoid hogging the CPU
                        sleep(.1)
                    else:
                        # All the processes are done, break now.
                        time_to_run = time() - p_starts
                        logger.info(
                            '1 spin_snab_flux_load_test_process completed in %.2f seconds'
                            % (time_to_run))
                        break
                else:
                    # We only enter this if we didn't 'break' above.
                    logger.info(
                        'timed out, killing spin_snab_flux_load_test_process process'
                    )
                    for p in pids:
                        p.terminate()
                        # p.join()

                for p in pids:
                    if p.is_alive():
                        logger.info(
                            'stopping spin_snab_flux_load_test_process - %s' %
                            (str(p.is_alive())))
                        p.join()

                process_runtime = time() - current_timestamp
                if process_runtime < 60:
                    sleep_for = (60 - process_runtime)
                    logger.info('sleeping for %.2f seconds' % sleep_for)
                    sleep(sleep_for)
                    try:
                        del sleep_for
                    except:
                        pass
Esempio n. 24
0
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Use get_redis_conn and get_redis_conn_decoded
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            if LOCAL_DEBUG:
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: debug :: flux.httpMetricDataQueue queue size - %s'
                        % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )

            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp, backfill]
                metric_data = self.q.get(True, 1)

            except Empty:
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            # @added 20200206 - Feature #3444: Allow flux to backfill
            # Added backfill
            backfill = False

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: debug :: queue item found - %s' %
                            str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s'
                        % str(metric_data))
                    continue

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check flux.last key if this is not backfill
                    if not backfill:
                        cache_key = 'flux.last.%s' % metric
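                        # The flux.last.<metric> key holds a string
                        # representation of the last submitted
                        # [timestamp, value], e.g. '[1556817000, 7.3]'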
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            #                      Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(
                                cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(
                                    redis_last_metric_data)
                                last_metric_timestamp = int(
                                    last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: last_metric_timestamp for %s from %s is %s'
                                        % (metric, str(cache_key),
                                           str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                    % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s'
                                        % (str(timestamp),
                                           str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        try:
                            graphyte.send(metric, value, timestamp)
                            submittedToGraphite = True
                            logger.info(
                                'worker :: sent %s, %s, %s to Graphite' %
                                (str(metric), str(value), str(timestamp)))
                            metrics_sent_to_graphite += 1
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to send metric data to Graphite for %s'
                                % str(metric))
                            metric = None
                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            if not backfill:
                                metric_data = [timestamp, value]
                                self.redis_conn.set(cache_key,
                                                    str(metric_data))
                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                # Add a flux.filled key to Redis with a expiry
                                # set to FULL_DURATION so that Analyzer knows to
                                # sort and deduplicate the Redis time series
                                # data as carbon-relay will send it to Horizon
                                # and the datapoints will be out of order in the
                                # Redis key
                                try:
                                    flux_filled_key = 'flux.filled.%s' % str(
                                        metric)
                                    self.redis_conn.setex(
                                        flux_filled_key,
                                        settings.FULL_DURATION, int(time()))
                                    logger.info('worker :: set Redis key %s' %
                                                (str(flux_filled_key)))
                                except Exception as e:
                                    logger.error(
                                        'error :: worker :: failed to set Redis flux.filled key: %s'
                                        % e)
                    else:
                        logger.info(
                            'worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite'
                            % (str(metric), str(value), str(timestamp),
                               str(timestamp)))
                else:
                    logger.info(
                        'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s'
                        % (str(settings.FLUX_SEND_TO_CARBON), str(metric),
                           str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    logger.info('worker sent %s, %s, %s to statsd' %
                                (metric, str(value), str(timestamp)))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                    % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metrics_sent_to_graphite)))
Esempio n. 25
0
def update_redis_set(
        current_skyline_app, redis_set, original_data_str, update_data_str,
        log=True):
    """
    Manage data in a Redis set.

    :param current_skyline_app: the app calling the function
    :param redis_set: the Redis key name of the set
    :param original_data_str: the data in the set on which to take action
    :param update_data_str: the updated data or the string 'remove' to remove
        the data from the set.
    :param log: whether to log or not, optional, defaults to True
    :type current_skyline_app: str
    :type redis_set: str
    :type original_data_str: str
    :type update_data_str: str
    :type log: boolean

    """

    function_str = 'functions.redis.update_set'

    if log:
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
    else:
        current_logger = None

    try:
        redis_conn = get_redis_conn(current_skyline_app)
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: failed to connect to Redis to manage data in Redis set %s - %s' % (
            function_str, redis_set, e))
    try:
        redis_conn.srem(redis_set, str(original_data_str))
        # @added 20220110 - Bug #4364: Prune old thunder.events
        #                   Branch #1444: thunder
        if log:
            current_logger.info('removed item from Redis set %s - %s' % (
                redis_set, str(original_data_str)))
    except Exception as e:
        if not log:
            current_skyline_app_logger = current_skyline_app + 'Log'
            current_logger = logging.getLogger(current_skyline_app_logger)
        current_logger.error(traceback.format_exc())
        current_logger.error('error :: %s :: failed to remove item from Redis set %s - %s' % (
            function_str, redis_set, e))
    if update_data_str != 'remove':
        try:
            redis_conn.sadd(redis_set, str(update_data_str))
            # @added 20220110 - Bug #4364: Prune old thunder.events
            #                   Branch #1444: thunder
            if log:
                current_logger.info('added updated item to Redis set %s - %s' % (
                    redis_set, str(update_data_str)))
        except Exception as e:
            if not log:
                current_skyline_app_logger = current_skyline_app + 'Log'
                current_logger = logging.getLogger(current_skyline_app_logger)
            current_logger.error(traceback.format_exc())
            current_logger.error('error :: %s :: failed to update item in Redis set %s - %s' % (
                function_str, redis_set, e))
    return
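
# Example usage (illustrative only, the event names below are hypothetical),
# replacing an item in a Redis set:
# update_redis_set('thunder', 'thunder.events', str(old_event), str(new_event))
# or removing it entirely:
# update_redis_set('thunder', 'thunder.events', str(old_event), 'remove')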
Esempio n. 26
0
    def run(self):
        """
        Called when the process initializes.
        """
        def pickle_data_to_graphite(data):

            message = None
            try:
                payload = pickle.dumps(data, protocol=2)
                header = struct.pack("!L", len(payload))
                message = header + payload
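                # The message follows the carbon pickle receiver format, a
                # pickled list of (metric, (timestamp, value)) tuples, e.g.
                # [('metric.name', (1556817000, 7.3))], prefixed with a
                # 4 byte big endian length header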
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to pickle to send to Graphite')
                return False
            if message:
                try:
                    sock = socket.socket()
                    sock.connect(
                        (CARBON_HOST, settings.FLUX_CARBON_PICKLE_PORT))
                    sock.sendall(message)
                    sock.close()
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send pickle data to Graphite'
                    )
                    return False
            else:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to pickle metric data into message'
                )
                return False
            return True

        def submit_pickle_data_to_graphite(pickle_data):

            # @modified 20201207 - Task #3864: flux - try except everything
            try:
                number_of_datapoints = len(pickle_data)
            except Exception as e:
                logger.error(
                    'error :: worker :: could not determine number_of_datapoints from len(pickle_data) - %s'
                    % str(e))
                return False

            data_points_sent = 0
            smallListOfMetricTuples = []
            tuples_added = 0

            for data in pickle_data:
                # @modified 20201207 - Task #3864: flux - try except everything
                try:
                    smallListOfMetricTuples.append(data)
                    tuples_added += 1
                    if tuples_added >= 480:
                        # @modified 20201207 - Task #3864: flux - try except everything
                        try:
                            pickle_data_sent = pickle_data_to_graphite(
                                smallListOfMetricTuples)
                        except Exception as e:
                            logger.error(
                                'error :: worker :: pickle_data_to_graphite error - %s'
                                % str(e))
                            pickle_data_sent = False

                        # Reduce the speed of submissions to Graphite
                        # if there are lots of data points
                        if number_of_datapoints > 4000:
                            sleep(0.3)
                        if pickle_data_sent:
                            data_points_sent += tuples_added
                            logger.info(
                                'worker :: sent %s/%s of %s data points to Graphite via pickle'
                                % (str(tuples_added), str(data_points_sent),
                                   str(number_of_datapoints)))
                            smallListOfMetricTuples = []
                            tuples_added = 0
                        else:
                            logger.error(
                                'error :: worker :: failed to send %s data points to Graphite via pickle'
                                % (str(tuples_added)))
                            return False
                except Exception as e:
                    logger.error(
                        'error :: worker :: error handling data in pickle_data - %s'
                        % str(e))
                    return False

            if smallListOfMetricTuples:
                # @modified 20201207 - Task #3864: flux - try except everything
                try:
                    tuples_to_send = len(smallListOfMetricTuples)
                    pickle_data_sent = pickle_data_to_graphite(
                        smallListOfMetricTuples)
                    if pickle_data_sent:
                        data_points_sent += tuples_to_send
                        logger.info(
                            'worker :: sent the last %s/%s of %s data points to Graphite via pickle'
                            % (str(tuples_to_send), str(data_points_sent),
                               str(number_of_datapoints)))
                    else:
                        logger.error(
                            'error :: failed to send the last %s data points to Graphite via pickle'
                            % (str(tuples_to_send)))
                        return False
                except Exception as e:
                    logger.error(
                        'error :: worker :: error in smallListOfMetricTuples pickle_data_to_graphite - %s'
                        % str(e))
                    return False

            return True

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
        last_zero_fill_to_graphite = 0
        metrics_sent = []

        remove_from_flux_queue_redis_set = []

        # @added 20201019 - Feature #3790: flux - pickle to Graphite
        pickle_data = []
        # send_to_reciever = 'line'
        send_to_reciever = 'pickle'

        # @modified 20201207 - Task #3864: flux - try except everything
        try:
            metric_data_queue_size = self.q.qsize()
        except Exception as e:
            logger.error(
                'error :: worker :: could not determine metric_data_queue_size - %s'
                % str(e))
            metric_data_queue_size = 0

        if metric_data_queue_size > 10:
            send_to_reciever = 'pickle'

        # @added 20201120 - Feature #3790: flux - pickle to Graphite
        # Debug Redis set
        metrics_data_sent = []

        # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
        # Even if flux.last Redis keys are disabled in flux they are used in
        # Vista
        vista_metrics = []
        if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
            try:
                vista_metrics = list(
                    self.redis_conn_decoded.sscan_iter('vista.metrics',
                                                       match='*'))
            except:
                vista_metrics = []

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Use get_redis_conn and get_redis_conn_decoded
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        self.redis_conn = get_redis_conn(skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: could not get_redis_conn - %s'
                            % str(e))
                    try:
                        self.redis_conn_decoded = get_redis_conn_decoded(
                            skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: could not get_redis_conn_decoded - %s'
                            % str(e))

            if LOCAL_DEBUG:
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: debug :: flux.httpMetricDataQueue queue size - %s'
                        % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )

            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp]
                metric_data = self.q.get(True, 1)

            except Empty:
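                # The queue timed out with no metric available, so flush any
                # datapoints that have accumulated in pickle_data rather than
                # holding them until the next batch fills up.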
                if pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: queue Empty failed to submit_pickle_data_to_graphite - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
                # @added 20201017 - Feature #3788: snab_flux_load_test
                # Send to Graphite even if worker gets no metrics
                if (int(time()) - last_sent_to_graphite) >= 60:
                    logger.info(
                        'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                        % str(metrics_sent_to_graphite))
                    skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                    try:
                        # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                        # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                        send_graphite_metric(skyline_app, skyline_metric,
                                             metrics_sent_to_graphite)
                        last_sent_to_graphite = int(time())
                        metrics_sent_to_graphite = 0
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to send_graphite_metric %s with %s'
                            % (skyline_metric, str(metrics_sent_to_graphite)))
                    metric_data_queue_size = 0
                    try:
                        metric_data_queue_size = self.q.qsize()
                        logger.info(
                            'worker :: flux.httpMetricDataQueue queue size - %s'
                            % str(metric_data_queue_size))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                        )
                    skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace
                    try:
                        send_graphite_metric(skyline_app, skyline_metric,
                                             metric_data_queue_size)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to send_graphite_metric %s with %s'
                            % (skyline_metric, str(metric_data_queue_size)))
                    # @added 20201019 - Feature #3790: flux - pickle to Graphite
                    if metric_data_queue_size > 10:
                        send_to_reciever = 'pickle'
                    else:
                        send_to_reciever = 'line'
                    send_to_reciever = 'pickle'

                    # @added 20201120 - Feature #3790: flux - pickle to Graphite
                    # Debug Redis set
                    metrics_data_sent_strs = []
                    for item in metrics_data_sent:
                        metrics_data_sent_strs.append(str(item))
                    if metrics_data_sent_strs:
                        try:
                            self.redis_conn.sadd('flux.metrics_data_sent',
                                                 *set(metrics_data_sent_strs))
                            logger.info(
                                'worker :: added %s items to the flux.metrics_data_sent Redis set'
                                % str(len(metrics_data_sent)))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine size of flux.queue Redis set'
                            )
                        metrics_data_sent = []
                        try:
                            new_set = 'aet.flux.metrics_data_sent.%s' % str(
                                self.current_pid)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine current_pid for the aet.flux.metrics_data_sent Redis set name'
                            )
                            new_set = 'aet.flux.metrics_data_sent'
                        try:
                            self.redis_conn.rename('flux.metrics_data_sent',
                                                   new_set)
                            logger.info(
                                'worker :: renamed flux.metrics_data_sent Redis set to %s'
                                % new_set)
                        # @added 20201128 - Feature #3820: HORIZON_SHARDS
                        # With metrics that come in at a frequency of less
                        # than 60 seconds, it is possible that this key will
                        # not exist as flux has not been sent metric data
                        # so this operation will error with no such key
                        except Exception as e:
                            traceback_str = traceback.format_exc()
                            if 'no such key' in str(e):
                                logger.warn(
                                    'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s'
                                    % (new_set, e))
                            else:
                                logger.error(traceback_str)
                                logger.error(
                                    'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set'
                                    % new_set)
                        try:
                            self.redis_conn.expire(new_set, 600)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to set 600 seconds TTL on %s Redis set'
                                % new_set)

                    # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
                    if FLUX_PERSIST_QUEUE:
                        redis_set_size = 0
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'worker - flux.queue Redis set size of %s before removal of %s items'
                            % (str(redis_set_size),
                               str(len(remove_from_flux_queue_redis_set))))
                        if remove_from_flux_queue_redis_set:
                            try:
                                self.redis_conn.srem(
                                    'flux.queue',
                                    *set(remove_from_flux_queue_redis_set))
                                remove_from_flux_queue_redis_set = []
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to remove multiple items from flux.queue Redis set'
                                )
                            try:
                                redis_set_size = self.redis_conn.scard(
                                    'flux.queue')
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine size of flux.queue Redis set'
                                )
                            logger.info(
                                'worker - flux.queue Redis set size of %s after the removal of items'
                                % (str(redis_set_size)))
                            remove_from_flux_queue_redis_set = []
                    # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                    # Even if flux.last Redis keys are disabled in flux they are used in
                    # Vista
                    vista_metrics = []
                    if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
                        try:
                            vista_metrics = list(
                                self.redis_conn_decoded.sscan_iter(
                                    'vista.metrics', match='*'))
                        except:
                            vista_metrics = []
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            # @added 20200206 - Feature #3444: Allow flux to backfill
            # Added backfill
            backfill = False

            # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
            if metric_data and FLUX_PERSIST_QUEUE:
                try:
                    # Do not remove each individual metrics from the flux.queue
                    # Redis set, add to a list that is removed in one srem *set
                    # operation each 60 seconds.  This is a more performant
                    # method and requires a single blocking call for a batch of
                    # metrics, rather than a blocking call for every metric.
                    # self.redis_conn.srem('flux.queue', str(metric_data))
                    remove_from_flux_queue_redis_set.append(str(metric_data))
                except:
                    pass

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: debug :: queue item found - %s' %
                            str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s'
                        % str(metric_data))
                    continue

                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Only check the flux.last key if this is not a backfill and
                # FLUX_CHECK_LAST_TIMESTAMP is enabled, or if VISTA_ENABLED and
                # the metric is a Vista metric
                cache_key = None
                # if FLUX_CHECK_LAST_TIMESTAMP:
                cache_key = 'flux.last.%s' % metric
                check_flux_last_key = False
                if not backfill and FLUX_CHECK_LAST_TIMESTAMP:
                    check_flux_last_key = True
                if VISTA_ENABLED:
                    if metric in vista_metrics:
                        check_flux_last_key = True
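                # check_flux_last_key gates both the de-duplication read below
                # and the flux.last key update after a successful submission.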

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check flux.last key if this is not backfill
                    # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                    # Use the check_flux_last_key value determined above
                    # if not backfill:
                    if check_flux_last_key:
                        # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                        # Set cache_key outside the conditional block
                        # cache_key = 'flux.last.%s' % metric
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            #                      Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(
                                cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(
                                    redis_last_metric_data)
                                last_metric_timestamp = int(
                                    last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: last_metric_timestamp for %s from %s is %s'
                                        % (metric, str(cache_key),
                                           str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                    % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s'
                                        % (str(timestamp),
                                           str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        if send_to_reciever == 'line':
                            try:
                                graphyte.send(metric, value, timestamp)
                                submittedToGraphite = True
                                # modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: sent %s, %s, %s to Graphite - via graphyte'
                                        % (str(metric), str(value),
                                           str(timestamp)))
                                metrics_sent_to_graphite += 1
                                # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append(
                                    [metric, value, timestamp])
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to send metric data to Graphite for %s'
                                    % str(metric))
                                metric = None
                        if send_to_reciever == 'pickle':
                            # @modified 20201212 - Task #3864: flux - try except everything
                            try:
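                                # carbon's pickle receiver expects a list of
                                # (path, (timestamp, value)) tuples, which is
                                # the format appended to pickle_data here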
                                tuple_data = (metric, (int(timestamp),
                                                       float(value)))
                                pickle_data.append(tuple_data)
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: sending %s, %s, %s to Graphite - via pickle'
                                        % (str(metric), str(value),
                                           str(timestamp)))
                                submittedToGraphite = True
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append(
                                    [metric, value, timestamp])
                            except Exception as e:
                                logger.error(
                                    'error :: worker :: failed to append to pickle_data - %s'
                                    % str(e))

                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                            # Use the check_flux_last_key value determined above
                            # if not backfill:
                            if check_flux_last_key:
                                metric_data = [timestamp, value]

                                # @modified 20201207 - Task #3864: flux - try except everything
                                try:
                                    self.redis_conn.set(
                                        cache_key, str(metric_data))
                                except Exception as e:
                                    logger.error(
                                        'error :: worker :: failed to set check_flux_last_key Redis key - %s'
                                        % str(e))

                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20201120 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                                #                   Feature #3400: Identify air gaps in the metric data
                                # Only execute if IDENTIFY_AIRGAPS is enabled
                                if IDENTIFY_AIRGAPS:
                                    # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                    # Add a flux.filled key to Redis with a expiry
                                    # set to FULL_DURATION so that Analyzer knows to
                                    # sort and deduplicate the Redis time series
                                    # data as carbon-relay will send it to Horizon
                                    # and the datapoints will be out of order in the
                                    # Redis key
                                    try:
                                        flux_filled_key = 'flux.filled.%s' % str(
                                            metric)
                                        self.redis_conn.setex(
                                            flux_filled_key,
                                            settings.FULL_DURATION,
                                            int(time()))
                                        logger.info(
                                            'worker :: set Redis key %s' %
                                            (str(flux_filled_key)))
                                    except Exception as e:
                                        logger.error(
                                            'error :: worker :: failed to set Redis flux.filled key - %s'
                                            % e)
                    else:
                        # modified 20201016 - Feature #3788: snab_flux_load_test
                        if FLUX_VERBOSE_LOGGING:
                            logger.info(
                                'worker :: discarded %s, %s, %s - a data point with an equal or newer timestamp has already been submitted to Graphite for %s'
                                % (str(metric), str(value), str(timestamp),
                                   metric))
                else:
                    logger.info(
                        'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s'
                        % (str(settings.FLUX_SEND_TO_CARBON), str(metric),
                           str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    # modified 20201016 - Feature #3788: snab_flux_load_test
                    if FLUX_VERBOSE_LOGGING:
                        logger.info('worker sent %s, %s, %s to statsd' %
                                    (metric, str(value), str(timestamp)))
                    # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                    metrics_sent.append(metric)

                submit_pickle_data = False
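                # Batch pickle submissions - flush when 1000 datapoints have
                # accumulated or when the queue has been drained, whichever
                # comes first.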
                if pickle_data:
                    number_of_datapoints = len(pickle_data)
                    if number_of_datapoints >= 1000:
                        submit_pickle_data = True
                    else:
                        try:
                            metric_data_queue_size = self.q.qsize()
                        except:
                            metric_data_queue_size = 0
                        if metric_data_queue_size == 0:
                            submit_pickle_data = True
                if submit_pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
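                        # A minimal sketch (an assumption, not this project's
                        # implementation) of how a batch is typically framed
                        # for carbon's pickle receiver:
                        #   payload = pickle.dumps(pickle_data, protocol=2)
                        #   header = struct.pack('!L', len(payload))
                        #   sock.sendall(header + payload)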
                    except Exception as e:
                        logger.error(
                            'error :: worker :: submit_pickle_data_to_graphite failed - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []

            time_now = int(time())

            # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
            # Send 0 for any metric in the flux.zero_fill_metrics Redis set that
            # has not submitted data in the last 60 seconds.  The flux.last
            # Redis key is not updated for these sent 0 values so if the source
            # sends data for a timestamp in the period later (due to a lag, etc),
            # it will be valid and sent to Graphite.
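            # This block runs on the same 60 second cadence as the Graphite
            # stats flush below, as both are gated on last_sent_to_graphite.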
            if FLUX_ZERO_FILL_NAMESPACES:
                if not last_zero_fill_to_graphite:
                    last_zero_fill_to_graphite = time_now - 60
                if (time_now - last_sent_to_graphite) >= 60:
                    # Initialise to an empty list so the loop below does not
                    # raise a NameError if the Redis query fails
                    flux_zero_fill_metrics = []
                    try:
                        flux_zero_fill_metrics = list(
                            self.redis_conn_decoded.smembers(
                                'flux.zero_fill_metrics'))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: failed to generate a list from flux.zero_fill_metrics Redis set'
                        )
                    for flux_zero_fill_metric in flux_zero_fill_metrics:
                        if flux_zero_fill_metric not in metrics_sent:
                            try:
                                graphyte.send(flux_zero_fill_metric, 0.0,
                                              time_now)
                                # modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: zero fill - sent %s, %s, %s to Graphite'
                                        % (str(flux_zero_fill_metric),
                                           str(0.0), str(time_now)))
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(flux_zero_fill_metric)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: zero fill - failed to send metric data to Graphite for %s'
                                    % str(flux_zero_fill_metric))
                                metric = None
                    last_zero_fill_to_graphite = time_now
                    metrics_sent = []

            if (time_now - last_sent_to_graphite) >= 60:
                if pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: submit_pickle_data_to_graphite failed last_sent_to_graphite >= 60 - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []
                logger.info(
                    'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                    % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metrics_sent_to_graphite)))
                metric_data_queue_size = 0
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: flux.httpMetricDataQueue queue size - %s' %
                        str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )
                skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metric_data_queue_size)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metric_data_queue_size)))
                # @added 20201019 - Feature #3790: flux - pickle to Graphite
                if metric_data_queue_size > 10:
                    send_to_reciever = 'pickle'
                else:
                    send_to_reciever = 'line'

                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                # Debug Redis set
                metrics_data_sent_strs = []
                for item in metrics_data_sent:
                    metrics_data_sent_strs.append(str(item))
                if metrics_data_sent_strs:
                    try:
                        self.redis_conn.sadd('flux.metrics_data_sent',
                                             *set(metrics_data_sent_strs))
                        logger.info(
                            'worker :: added %s items to the flux.metrics_data_sent Redis set'
                            % str(len(metrics_data_sent)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine size of flux.queue Redis set'
                        )
                    metrics_data_sent = []
                    try:
                        new_set = 'aet.flux.metrics_data_sent.%s' % str(
                            self.current_pid)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine current_pid for the aet.flux.metrics_data_sent Redis set name'
                        )
                        new_set = 'aet.flux.metrics_data_sent'
                    try:
                        self.redis_conn.rename('flux.metrics_data_sent',
                                               new_set)
                        logger.info(
                            'worker :: renamed flux.metrics_data_sent Redis set to %s'
                            % new_set)
                    # @modified 20201128 - Feature #3820: HORIZON_SHARDS
                    # With metrics that come in at a frequency of less
                    # than 60 seconds, it is possible that this key will
                    # not exist as flux has not been sent metric data
                    # so this operation will error with no such key
                    except Exception as e:
                        traceback_str = traceback.format_exc()
                        if 'no such key' in str(e):
                            logger.warn(
                                'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s'
                                % (new_set, e))
                        else:
                            logger.error(traceback_str)
                            logger.error(
                                'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set'
                                % new_set)

                    try:
                        self.redis_conn.expire(new_set, 600)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to set 600 seconds TTL on %s Redis set'
                            % new_set)

                # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
                if FLUX_PERSIST_QUEUE:
                    redis_set_size = 0
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine size of flux.queue Redis set'
                        )
                    logger.info(
                        'worker - flux.queue Redis set size %s before removal of %s items'
                        % (str(redis_set_size),
                           str(len(remove_from_flux_queue_redis_set))))
                    if remove_from_flux_queue_redis_set:
                        try:
                            self.redis_conn.srem(
                                'flux.queue',
                                *set(remove_from_flux_queue_redis_set))
                            remove_from_flux_queue_redis_set = []
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to remove multiple items from flux.queue Redis set'
                            )
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'worker - flux.queue Redis set size of %s after the removal of items'
                            % (str(redis_set_size)))
                        remove_from_flux_queue_redis_set = []
                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Even if flux.last Redis keys are disabled in flux they are used in
                # Vista
                vista_metrics = []
                if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
                    try:
                        vista_metrics = list(
                            self.redis_conn_decoded.sscan_iter('vista.metrics',
                                                               match='*'))
                    except:
                        vista_metrics = []
Esempio n. 27
0
    def run(self):
        """
        Called when the process initializes.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1
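        # Wait up to 5 seconds for bin/<skyline_app>.d to release the log lock
        # before this process starts logging.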

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
        # If Luminosity is not enabled, do nothing
        luminosity_enabled = True
        try:
            luminosity_enabled = settings.LUMINOSITY_ENABLED
            logger.info('LUMINOSITY_ENABLED is set to %s' %
                        str(luminosity_enabled))
        except:
            logger.info(
                'warning :: LUMINOSITY_ENABLED is not declared in settings.py, defaults to True'
            )

        # @added 20190417 - Feature #2950: Report defaulted settings to log
        # Added all the globally declared settings to enable reporting in the
        # log the state of each setting.
        try:
            ENABLE_LUMINOSITY_DEBUG = settings.ENABLE_LUMINOSITY_DEBUG
            logger.info(
                'ENABLE_LUMINOSITY_DEBUG is set from settings.py to %s' %
                str(ENABLE_LUMINOSITY_DEBUG))
        except:
            logger.info(
                'warning :: ENABLE_LUMINOSITY_DEBUG is not declared in settings.py, defaults to False'
            )
            ENABLE_LUMINOSITY_DEBUG = False
        try:
            SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME
            if SERVER_METRIC_PATH == '.':
                SERVER_METRIC_PATH = ''
            logger.info('SERVER_METRIC_PATH is set from settings.py to %s' %
                        str(SERVER_METRIC_PATH))
        except:
            SERVER_METRIC_PATH = ''
            logger.info(
                'warning :: SERVER_METRIC_PATH is not declared in settings.py, defaults to \'\''
            )
        try:
            LUMINOSITY_PROCESSES = settings.LUMINOSITY_PROCESSES
            logger.info('LUMINOSITY_PROCESSES is set from settings.py to %s' %
                        str(LUMINOSITY_PROCESSES))
        except:
            # @modified 20180110 - Task #2266: Evaluate luminol for the luminosity branch
            # It is fast and lightweight
            # luminosity_processes = 2
            LUMINOSITY_PROCESSES = 1
            logger.info(
                'warning :: cannot determine LUMINOSITY_PROCESSES from settings.py, defaults to %s'
                % str(LUMINOSITY_PROCESSES))

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_LUMINOSITY_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error(
                    'error :: cannot connect to redis at socket path %s' %
                    (settings.REDIS_SOCKET_PATH))
                sleep(30)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
                # types need to be decoded as utf-8 to str
                # if settings.REDIS_PASSWORD:
                #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                # else:
                #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                   Branch #3262: py3
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' %
                             skyline_app)

            # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
            # If Luminosity is not enabled, do nothing
            if not luminosity_enabled:
                logger.info(
                    'luminosity is not enabled, LUMINOSITY_ENABLED is set to %s, sleeping for 20 seconds'
                    % str(settings.LUMINOSITY_ENABLED))
                sleep(20)
                continue
            """
            Determine if any new anomalies have been added
            """
            while True:
                process_anomaly_id = None
                last_processed_anomaly_id = None
                memcache_last_processed_anomaly_id_data = False
                # Check memcached before MySQL
                memcache_key = '%s.last.processed.anomaly.id' % skyline_app
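                # memcache is checked before MySQL so that the last processed
                # anomaly id can usually be resolved without a database query.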
                if settings.MEMCACHE_ENABLED:
                    try:
                        # @modified 20191029 - Task #3304: py3 - handle pymemcache bytes not str
                        # last_processed_anomaly_id = self.memcache_client.get(memcache_key)
                        if python_version == 2:
                            last_processed_anomaly_id = self.memcache_client.get(
                                memcache_key)
                        else:
                            last_processed_anomaly_id = self.memcache_client.get(
                                memcache_key).decode('utf-8')
                        # if memcache does not have the key the response to the
                        # client is None, it does not except
                    except:
                        # @modified 20200507 - stop reporting this as an error
                        # it can be expected to happen from time to time
                        # logger.error('error :: failed to get %s from memcache' % memcache_key)
                        logger.info(
                            'failed to get %s from memcache, will query DB' %
                            memcache_key)
                    try:
                        self.memcache_client.close()
                    except:
                        logger.error(
                            'error :: failed to close memcache_client')

                if last_processed_anomaly_id:
                    logger.info(
                        'last_processed_anomaly_id found in memcache - %s' %
                        str(last_processed_anomaly_id))
                    memcache_last_processed_anomaly_id_data = True
                else:
                    # @modified 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                    #                      Branch #3002: docker
                    # Log appropriate to whether memcache is enabled or not
                    if settings.MEMCACHE_ENABLED:
                        logger.info(
                            'last_processed_anomaly_id key was NOT found in memcache - %s'
                            % str(last_processed_anomaly_id))
                    else:
                        logger.info(
                            'memcache not enabled not checking for last_processed_anomaly_id key'
                        )

                if not last_processed_anomaly_id:
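                    # Nothing in memcache, fall back to the newest luminosity
                    # row so processing resumes from the most recent
                    # correlation recorded in the database.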
                    query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1'
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        try:
                            last_processed_anomaly_id = int(results[0][0])
                            logger.info(
                                'last_processed_anomaly_id found from DB - %s'
                                % str(last_processed_anomaly_id))
                        except:
                            logger.error(traceback.format_exc())

                        if last_processed_anomaly_id and settings.MEMCACHE_ENABLED:
                            if not memcache_last_processed_anomaly_id_data:
                                logger.info(
                                    'Populating memcache with DB result - %s' %
                                    str(last_processed_anomaly_id))
                                try:
                                    self.memcache_client.set(
                                        memcache_key,
                                        int(last_processed_anomaly_id))
                                    logger.info(
                                        'populated memcache key %s with %s' %
                                        (memcache_key,
                                         str(last_processed_anomaly_id)))
                                except:
                                    logger.error(
                                        'error :: failed to set the memcache key - %s - %s'
                                        % (memcache_key,
                                           str(last_processed_anomaly_id)))
                                try:
                                    self.memcache_client.close()
                                except:
                                    logger.error(
                                        'error :: failed to close memcache_client'
                                    )

                if not last_processed_anomaly_id:
                    # Check MySQL
                    now = int(time())
                    after = now - 600
                    query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str(
                        after)  # nosec
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                        # Handle the first one
                        last_processed_anomaly_id = process_anomaly_id - 1
                    else:
                        logger.info('no new anomalies in the anomalies table')

                # @added 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                #                   Branch #3002: docker
                # When Skyline is first installed, if luminosity is enabled it
                # reports errors as there are no anomaly ids
                if str(last_processed_anomaly_id) == 'None':
                    last_processed_anomaly_id = 0

                query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str(
                    last_processed_anomaly_id)  # nosec
                results = None
                try:
                    results = mysql_select(skyline_app, query)
                except:
                    logger.error('error :: MySQL query failed - %s' % query)
                if results:
                    try:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found the next new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: from query - %s' % query)
                else:
                    logger.info('no new anomalies in the anomalies table')

                if process_anomaly_id and last_processed_anomaly_id:
                    if isinstance(last_processed_anomaly_id, int):
                        if isinstance(process_anomaly_id, int):
                            if last_processed_anomaly_id == process_anomaly_id:
                                logger.info(
                                    'anomaly id already processed - %s' %
                                    str(process_anomaly_id))
                                process_anomaly_id = None

                if not process_anomaly_id:
                    logger.info(
                        'sleeping 20 no anomalies to correlate - last processed anomaly id - %s'
                        % str(last_processed_anomaly_id))
                    sleep(20)
                    up_now = time()
                    # Report app up
                    try:
                        self.redis_conn.setex(skyline_app, 120, up_now)
                        logger.info('updated Redis key for %s up' %
                                    skyline_app)
                    except:
                        logger.error(
                            'error :: failed to update Redis key for %s up' %
                            skyline_app)

                cache_key = '%s.sent_graphite_metrics' % skyline_app
                redis_sent_graphite_metrics = False
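                # The sent_graphite_metrics key is set with a 59 second TTL
                # after a flush, so the Graphite stats below are sent at most
                # once per minute.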
                try:
                    redis_sent_graphite_metrics = self.redis_conn.get(
                        cache_key)
                except Exception as e:
                    logger.error(
                        'error :: could not query Redis for key %s: %s' %
                        (cache_key, e))

                # Flush metrics to Graphite
                if not redis_sent_graphite_metrics:
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # correlations = str(len(self.correlations))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # correlations = str(len(list(self.redis_conn.smembers('luminosity.correlations'))))
                        correlations = str(
                            len(
                                list(
                                    self.redis_conn_decoded.smembers(
                                        'luminosity.correlations'))))
                    except:
                        correlations = '0'
                    logger.info('correlations       :: %s' % correlations)
                    send_metric_name = '%s.correlations' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         correlations)

                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    try:
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # runtimes = list(self.redis_conn.smembers('luminosity.runtimes'))
                        runtimes = list(
                            self.redis_conn_decoded.smembers(
                                'luminosity.runtimes'))
                    except:
                        runtimes = []

                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    #                   Branch #2270: luminosity
                    # runtime metric to monitor the time it takes to process
                    # correlations
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # if len(self.runtimes) > 1:
                        #     avg_runtime = sum(self.runtimes) / len(self.runtimes)
                        # else:
                        #     avg_runtime = sum(self.runtimes)
                        # the runtimes Redis set members are strings, convert
                        # them to floats before summing
                        if len(runtimes) > 1:
                            avg_runtime = sum([float(i) for i in runtimes]) / len(runtimes)
                        else:
                            avg_runtime = sum([float(i) for i in runtimes])
                    except:
                        avg_runtime = '0'
                    logger.info('avg_runtime       :: %s' % str(avg_runtime))
                    send_metric_name = '%s.avg_runtime' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         str(avg_runtime))
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # metrics_checked_for_correlation = str(sum(self.metrics_checked_for_correlation))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # metrics_checked_for_correlation = str(len(list(self.redis_conn.smembers('luminosity.metrics_checked_for_correlation'))))
                        metrics_checked_for_correlation = str(
                            len(
                                list(
                                    self.redis_conn_decoded.smembers(
                                        'luminosity.metrics_checked_for_correlation'
                                    ))))
                    except:
                        metrics_checked_for_correlation = '0'
                    logger.info('metrics_checked_for_correlation   :: %s' %
                                metrics_checked_for_correlation)
                    send_metric_name = '%s.metrics_checked_for_correlation' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         metrics_checked_for_correlation)
                    sent_graphite_metrics_now = int(time())
                    try:
                        self.redis_conn.setex(cache_key, 59,
                                              sent_graphite_metrics_now)
                        logger.info('updated Redis key - %s' % cache_key)
                    except:
                        logger.error(
                            'error :: failed to update Redis key - %s up' %
                            cache_key)

                    # Reset lists
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.correlations[:] = []
                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.runtimes[:] = []
                    # self.metrics_checked_for_correlation[:] = []

                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # Use Redis sets instead of Manager().list()
                    delete_redis_sets = [
                        'luminosity.correlations', 'luminosity.runtimes',
                        'luminosity.metrics_checked_for_correlation'
                    ]
                    for i_redis_set in delete_redis_sets:
                        redis_set_to_delete = i_redis_set
                        try:
                            self.redis_conn.delete(redis_set_to_delete)
                            logger.info('deleted Redis set - %s' %
                                        redis_set_to_delete)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to delete Redis set - %s' %
                                redis_set_to_delete)

                # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                #                   Feature #2464: luminosity_remote_data
                # Added the ability to set a Redis key to override the memcached
                # key luminosity.last.processed.anomaly.id so that it does not
                # have to be changed via telnet to memcache.
                if not process_anomaly_id or not redis_sent_graphite_metrics:
                    cache_key = '%s.last.processed.anomaly.id' % skyline_app
                    redis_last_processed_anomaly_id_redis_key = False
                    try:
                        redis_last_processed_anomaly_id_redis_key = self.redis_conn.get(
                            cache_key)
                    except Exception as e:
                        logger.error(
                            'error :: could not query Redis for key %s: %s' %
                            (cache_key, e))
                    if redis_last_processed_anomaly_id_redis_key:
                        logger.info(
                            'found Redis %s key to override the memcache key, setting process_anomaly_id to %s'
                            % (cache_key,
                               str(redis_last_processed_anomaly_id_redis_key)))
                        try:
                            process_anomaly_id = int(
                                redis_last_processed_anomaly_id_redis_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to set process_anomaly_id from Redis override key value'
                            )
                        # And remove the Redis override key as it is only meant
                        # to override once, to allow for a replay for debug
                        # purposes.
                        try:
                            self.redis_conn.setex(
                                cache_key, 1,
                                int(redis_last_processed_anomaly_id_redis_key))
                            logger.info('updated Redis key - %s' % cache_key)
                        except:
                            logger.error(
                                'error :: failed to update Redis key - %s with a 1 second expiry to delete it'
                                % cache_key)

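                # Break out of the wait loop once there is an anomaly id to
                # correlate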
                if process_anomaly_id:
                    break

            # Spawn process
            logger.info('spawning processes to correlate anomaly id %s' %
                        str(process_anomaly_id))
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
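            # Start one spin_process per LUMINOSITY_PROCESSES to correlate the
            # anomaly in parallel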
            for i in range(1, LUMINOSITY_PROCESSES + 1):
                try:
                    p = Process(target=self.spin_process,
                                args=(i, process_anomaly_id))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' %
                                (str(pid_count), str(LUMINOSITY_PROCESSES)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to start spin_process')
                    continue

            # Self monitor processes and terminate if any spin_process has run
            # for too long
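            # Each spin_process is allowed up to 60 seconds to complete before
            # being terminated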
            p_starts = time()
            while time() - p_starts <= 60:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s spin_process completed in %.2f seconds' %
                                (str(LUMINOSITY_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('timed out, killing all spin_process processes')
                for p in pids:
                    try:
                        p.terminate()
                        # p.join()
                        logger.info('killed spin_process process')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to terminate spin_process process')

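            # Join any spin_process processes that are still alive so no
            # orphaned processes are left behind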
            for p in pids:
                if p.is_alive():
                    logger.info('stopping spin_process - %s' %
                                (str(p.is_alive())))
                    p.join()

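            # Pace the run loop, if the spawned processes completed in under 10
            # seconds sleep for the remainder so the loop does not spin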
            process_runtime = time() - now
            if process_runtime < 10:
                sleep_for = (10 - process_runtime)
                logger.info(
                    'sleeping for %.2f seconds due to low run time...' %
                    sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except:
                    logger.error('error :: failed to del sleep_for')
            try:
                del process_runtime
            except:
                logger.error('error :: failed to del process_runtime')