Example #1
    def listen_udp(self):
        """
        Listen over udp for MessagePack strings
        """
        while 1:
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                s.bind((self.ip, self.port))
                logger.info('%s :: listening over udp for messagepack on %s' % (skyline_app, self.port))

                chunk = []
                while 1:
                    self.check_if_parent_is_alive()
                    data, addr = s.recvfrom(1024)

                    # @modified 20191014 - Task #3272: horizon - listen - py3 handle msgpack bytes
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # msgpack encoding of bytes and not str as per
                    # https://msgpack.org/#string-and-binary-type Python/msgpack
                    # and https://stackoverflow.com/a/47070687/107406
                    # metric = unpackb(data)
                    if python_version == 3:
                        metric = unpackb(data, encoding='utf-8')
                    else:
                        metric = unpackb(data)

                    chunk.append(metric)

                    # Queue the chunk and empty the variable
                    if len(chunk) > settings.CHUNK_SIZE:
                        try:
                            self.q.put(list(chunk), block=False)
                            chunk[:] = []

                        # Drop chunk if queue is full
                        except Full:
                            chunks_dropped = str(len(chunk))
                            logger.info(
                                '%s :: UDP queue is full, dropping %s datapoints'
                                % (skyline_app, chunks_dropped))
                            send_metric_name = '%s.udp_chunks_dropped' % skyline_app_graphite_namespace
                            send_graphite_metric(skyline_app, send_metric_name, chunks_dropped)
                            chunk[:] = []

            except Exception as e:
                logger.info('%s :: cannot connect to socket: %s' % (skyline_app, str(e)))
                break
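
The py2/py3 branch above targets older msgpack releases. In msgpack >= 1.0 the `encoding` keyword was removed; the equivalent is `raw=False` (the default from 1.0 onwards), which makes `unpackb` return `str` instead of `bytes`. A minimal sketch of a version-proof decode, assuming msgpack >= 1.0:

    from msgpack import packb, unpackb

    # Pack a (metric name, [timestamp, value]) pair as the listener receives it
    data = packb(['stats.host.cpu', [1571000000, 0.5]])

    # raw=False decodes msgpack string types to Python str rather than bytes
    metric = unpackb(data, raw=False)
    assert metric == ['stats.host.cpu', [1571000000, 0.5]]
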
Example #2
    def listen_udp(self):
        """
        Listen over udp for MessagePack strings
        """
        while 1:
            try:
                s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                s.bind((self.ip, self.port))
                logger.info('%s :: listening over udp for messagepack on %s' %
                            (skyline_app, self.port))

                chunk = []
                while 1:
                    self.check_if_parent_is_alive()
                    data, addr = s.recvfrom(1024)
                    metric = unpackb(data)
                    chunk.append(metric)

                    # Queue the chunk and empty the variable
                    if len(chunk) > settings.CHUNK_SIZE:
                        try:
                            self.q.put(list(chunk), block=False)
                            chunk[:] = []

                        # Drop chunk if queue is full
                        except Full:
                            chunks_dropped = str(len(chunk))
                            logger.info(
                                '%s :: UDP queue is full, dropping %s datapoints'
                                % (skyline_app, chunks_dropped))
                            send_metric_name = '%s.udp_chunks_dropped' % skyline_app_graphite_namespace
                            send_graphite_metric(skyline_app, send_metric_name,
                                                 chunks_dropped)
                            chunk[:] = []

            except Exception as e:
                logger.info('%s :: cannot connect to socket: %s' %
                            (skyline_app, str(e)))
                break
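
Both listeners share the same back-pressure strategy: batch incoming datapoints into a chunk and, when the consumer queue is full, drop the whole chunk rather than block the UDP read loop. A minimal standalone sketch of that pattern; the queue bound and CHUNK_SIZE value are illustrative assumptions, not Skyline's settings:

    from multiprocessing import Queue
    from queue import Full

    CHUNK_SIZE = 10
    q = Queue(maxsize=100)
    chunk = []

    def enqueue(metric):
        chunk.append(metric)
        if len(chunk) > CHUNK_SIZE:
            try:
                # Hand the consumer a copy; never block the read loop
                q.put(list(chunk), block=False)
            except Full:
                pass  # drop the batch, the socket keeps draining
            chunk[:] = []  # empty the chunk in place either way
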
Example #3
    def run(self):
        """
        - Called when the process initializes.

        - Determine if Redis is up and discover the number of `unique_metrics`.

        - Divide the `unique_metrics` between the number of `ANALYZER_PROCESSES`
          and assign each process a set of metrics to analyse for anomalies.

        - Wait for the processes to finish.

        - Determine whether any anomalous metrics require:

            - Alerting on (and set `EXPIRATION_TIME` key in Redis for alert).
            - Feed to another module e.g. mirage.
            - Alert to syslog.

        - Populate the webapp json with the anomalous_metrics details.

        - Log the details about the run to the skyline analyzer log.

        - Send skyline.analyzer metrics to `GRAPHITE_HOST`
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
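        # Wait up to 5 seconds for the log lock file to clear; setting now
        # past log_wait_for exits the loop as soon as the lock file is gone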
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        if not os.path.exists(settings.SKYLINE_TMP_DIR):
            # 0o755 octal literal is valid on both Python 2.6+ and Python 3
            mode_arg = 0o755
            os.makedirs(settings.SKYLINE_TMP_DIR, mode_arg)

        # Initiate the algorithm timings if Analyzer is configured to send the
        # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
        algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
        algorithms_to_time = []
        if send_algorithm_run_metrics:
            algorithms_to_time = settings.ALGORITHMS

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Use count files rather than multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
                # with open(algorithm_count_file, 'a') as f:
                with open(algorithm_count_file, 'w') as f:
                    pass
                with open(algorithm_timings_file, 'w') as f:
                    pass

            # Remove any existing algorithm.error files from any previous runs
            # that did not cleanup for any reason
            pattern = '%s.*.algorithm.error' % skyline_app
            try:
                for f in os.listdir(settings.SKYLINE_TMP_DIR):
                    if re.search(pattern, f):
                        try:
                            os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f))
                            logger.info('cleaning up old error file - %s' % (str(f)))
                        except OSError:
                            pass
            except:
                logger.error('failed to cleanup algorithm.error files ' + traceback.format_exc())

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                pid_count += 1
                logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
                p.start()
                spawned_pids.append(p.pid)

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than 180 seconds - 20160512 @earthgecko
            p_starts = time()
            while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s :: %s spin_process/es completed in %.2f seconds' % (skyline_app, str(settings.ANALYZER_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    p.join()

            # Log the last reported error by any algorithms that errored in the
            # spawned processes from algorithms.py
            for completed_pid in spawned_pids:
                logger.info('spin_process with pid %s completed' % (str(completed_pid)))
                for algorithm in settings.ALGORITHMS:
                    algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % (
                        settings.SKYLINE_TMP_DIR, skyline_app,
                        str(completed_pid), algorithm)
                    if os.path.isfile(algorithm_error_file):
                        logger.info(
                            'error - spin_process with pid %s has reported an error with the %s algorithm' % (
                                str(completed_pid), algorithm))
                        try:
                            with open(algorithm_error_file, 'r') as f:
                                error_string = f.read()
                            logger.error('%s' % str(error_string))
                        except:
                            logger.error('failed to read %s error file' % algorithm)
                        try:
                            os.remove(algorithm_error_file)
                        except OSError:
                            pass

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown:
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions:
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.ENABLE_ALERTS:
                for alert in settings.ALERTS:
                    for metric in self.anomalous_metrics:
                        ALERT_MATCH_PATTERN = alert[0]
                        METRIC_PATTERN = metric[1]
                        alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                        pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                        if pattern_match:
                            cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                            try:
                                last_alert = self.redis_conn.get(cache_key)
                                if not last_alert:
                                    try:
                                        SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                        logger.info('mirage check      :: %s' % (metric[1]))
                                        # Write anomalous metric to test at second
                                        # order resolution by crucible to the check
                                        # file
                                        metric_timestamp = int(time())
                                        anomaly_check_file = '%s/%s.%s.txt' % (settings.MIRAGE_CHECK_PATH, metric_timestamp, metric[1])
                                        with open(anomaly_check_file, 'w') as fh:
                                            # metric_name, anomalous datapoint, hours to resolve, timestamp
                                            fh.write('metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (metric[1], metric[0], alert[3], metric_timestamp))
                                        # 0o644 octal literal is valid on both
                                        # Python 2.6+ and Python 3
                                        mode_arg = 0o644
                                        os.chmod(anomaly_check_file, mode_arg)

                                        logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
                                        # Add to the mirage_metrics list
                                        base_name = METRIC_PATTERN.replace(settings.FULL_NAMESPACE, '', 1)
                                        metric = [metric[0], base_name]
                                        self.mirage_metrics.append(metric)
                                        # Alert for analyzer if enabled
                                        if settings.ENABLE_FULL_DURATION_ALERTS:
                                            self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                            trigger_alert(alert, metric)
                                    except:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)
                            except Exception as e:
                                logger.error('error :: could not send alert: %s' % e)

            # Push to crucible
#            if len(self.crucible_anomalous_metrics) > 0:
#                logger.info('to do - push to crucible')

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            # Use count files rather than multiprocessing.Value to enable
            # metrics for algorithm run times, etc
            for algorithm in algorithms_to_time:
                algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
                algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

                try:
                    algorithm_count_array = []
                    with open(algorithm_count_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_count_array.append(float_value)
                except:
                    algorithm_count_array = False

                if not algorithm_count_array:
                    continue

                number_of_times_algorithm_run = len(algorithm_count_array)
                logger.info(
                    'algorithm run count - %s run %s times' % (
                        algorithm, str(number_of_times_algorithm_run)))
                if number_of_times_algorithm_run == 0:
                    continue

                try:
                    algorithm_timings_array = []
                    with open(algorithm_timings_file, 'r') as f:
                        for line in f:
                            value_string = line.replace('\n', '')
                            unquoted_value_string = value_string.replace("'", '')
                            float_value = float(unquoted_value_string)
                            algorithm_timings_array.append(float_value)
                except:
                    algorithm_timings_array = False

                if not algorithm_timings_array:
                    continue

                number_of_algorithm_timings = len(algorithm_timings_array)
                logger.info(
                    'algorithm timings count - %s has %s timings' % (
                        algorithm, str(number_of_algorithm_timings)))

                if number_of_algorithm_timings == 0:
                    continue

                try:
                    _sum_of_algorithm_timings = sum(algorithm_timings_array)
                except:
                    logger.error("sum error: " + traceback.format_exc())
                    _sum_of_algorithm_timings = round(0.0, 6)
                    logger.error('error - sum_of_algorithm_timings - %s' % (algorithm))
                    continue

                sum_of_algorithm_timings = round(_sum_of_algorithm_timings, 6)
                # logger.info('sum_of_algorithm_timings - %s - %.16f seconds' % (algorithm, sum_of_algorithm_timings))

                try:
                    _median_algorithm_timing = determine_median(algorithm_timings_array)
                except:
                    _median_algorithm_timing = round(0.0, 6)
                    logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                    continue
                median_algorithm_timing = round(_median_algorithm_timing, 6)
                # logger.info('median_algorithm_timing - %s - %.16f seconds' % (algorithm, median_algorithm_timing))

                logger.info(
                    'algorithm timing - %s - total: %.6f - median: %.6f' % (
                        algorithm, sum_of_algorithm_timings,
                        median_algorithm_timing))
                use_namespace = skyline_app_graphite_namespace + '.algorithm_breakdown.' + algorithm
                send_metric_name = use_namespace + '.timing.times_run'
                send_graphite_metric(skyline_app, send_metric_name, str(number_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.total_time'
                send_graphite_metric(skyline_app, send_metric_name, str(sum_of_algorithm_timings))
                send_metric_name = use_namespace + '.timing.median_time'
                send_graphite_metric(skyline_app, send_metric_name, str(median_algorithm_timing))

            run_time = time() - now
            total_metrics = str(len(unique_metrics))
            total_analyzed = str(len(unique_metrics) - sum(exceptions.values()))
            total_anomalies = str(len(self.anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % run_time)
            logger.info('total metrics     :: %s' % total_metrics)
            logger.info('total analyzed    :: %s' % total_analyzed)
            logger.info('total anomalies   :: %s' % total_anomalies)
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            graphite_run_time = '%.2f' % run_time
            send_metric_name = skyline_app_graphite_namespace + '.run_time'
            send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)

            send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
            send_graphite_metric(skyline_app, send_metric_name, total_analyzed)

            send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
            send_graphite_metric(skyline_app, send_metric_name, total_anomalies)

            send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
            send_graphite_metric(skyline_app, send_metric_name, total_metrics)
            for key, value in exceptions.items():
                send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))
            for key, value in anomaly_breakdown.items():
                send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
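                # time_human is the span of the canary timeseries in hours;
                # projected estimates how many seconds a run would take to
                # cover 24 hours of data at this run's observed rate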
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human

                logger.info('canary duration   :: %.2f' % time_human)
                send_metric_name = skyline_app_graphite_namespace + '.duration'
                send_graphite_metric(skyline_app, send_metric_name, str(time_human))

                send_metric_name = skyline_app_graphite_namespace + '.projected'
                send_graphite_metric(skyline_app, send_metric_name, str(projected))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Sleep if it went too fast
            # if time() - now < 5:
            #    logger.info('sleeping due to low run time...')
            #    sleep(10)
            # @modified 20160504 - @earthgecko - development internal ref #1338, #1340
            # Etsy's original behaviour here was to sleep only if the run took
            # less than 5 seconds, which does not make the skyline Analyzer
            # very efficient in installations where 100s of 1000s of metrics
            # are being analyzed.  It led to Analyzer running over the same
            # metrics multiple times in a minute and always working -
            # analysing a few 1000 metrics in 9 seconds and then doing it
            # again and again within a single minute.  The
            # ANALYZER_OPTIMUM_RUN_DURATION setting was therefore added to
            # allow this to self optimise in cases where skyline is NOT
            # deployed to analyze 100s of 1000s of metrics, optimising
            # performance for deployments in the few 1000s of metrics and
            # 60 second resolution area, e.g. smaller and local deployments.
            process_runtime = time() - now
            analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
            if process_runtime < analyzer_optimum_run_duration:
                sleep_for = (analyzer_optimum_run_duration - process_runtime)
                logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
                sleep(sleep_for)
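
The process supervision above relies on Python's while/else: the else branch runs only if the loop condition expires without a break, i.e. only on timeout. A minimal sketch of the same watchdog pattern, with an illustrative max_runtime:

    from multiprocessing import Process
    from time import sleep, time

    def supervise(pids, max_runtime=180):
        p_starts = time()
        while time() - p_starts <= max_runtime:
            if any(p.is_alive() for p in pids):
                sleep(.1)  # poll cheaply instead of blocking on join()
            else:
                break  # all workers finished within the deadline
        else:
            # Only reached when the deadline expired without a break
            for p in pids:
                p.terminate()
                p.join()
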
Example #4
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        # @modified 20201017 - Feature #3788: snab_flux_load_test
        #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
        # Only do log management with the canary worker
        if self.canary:
            if os.path.isfile(skyline_app_logwait):
                try:
                    os_remove(skyline_app_logwait)
                except OSError:
                    logger.error('error - failed to remove %s, continuing' %
                                 skyline_app_logwait)
                    pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s worker number %s run' %
                    (skyline_app, self.worker_number))
        # @modified 20201017 - Feature #3788: snab_flux_load_test
        #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
        if self.canary:
            if os.path.isfile(skyline_app_loglock):
                logger.error(
                    'error - bin/%s.d log management seems to have failed, continuing'
                    % skyline_app)
                try:
                    os_remove(skyline_app_loglock)
                    logger.info('log lock file removed')
                except OSError:
                    logger.error('error - failed to remove %s, continuing' %
                                 skyline_app_loglock)
                    pass
            else:
                logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started worker %s' %
                    (skyline_app, str(self.worker_number)))

        # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
        skip_metrics_dict = {}
        do_not_skip_metrics_dict = {}
        skip_metrics_list = []
        do_not_skip_metrics_list = []
        checked_for_skip_count = 0

        # @added 20201103 - Feature #3820: HORIZON_SHARDS
        if HORIZON_SHARDS:
            logger.info(
                '%s :: HORIZON_SHARDS declared, this horizon instance is assigned shard %s'
                % (skyline_app, str(HORIZON_SHARD)))

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION
        full_uniques = '%sunique_metrics' % FULL_NAMESPACE
        mini_uniques = '%sunique_metrics' % MINI_NAMESPACE
        pipe = self.redis_conn.pipeline()

        last_send_to_graphite = time()
        queue_sizes = []
        # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
        last_log_queue_size = time()
        queue_sizes_60_seconds = []

        # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
        datapoints_sent_to_redis = 0
        last_datapoints_to_redis = int(time())

        # @added 20201103 - Feature #3820: HORIZON_SHARDS
        if HORIZON_SHARDS:
            horizon_shard_assigned_metrics = []
            horizon_shard_dropped_metrics = []
        # @added 20201120 - Feature #3820: HORIZON_SHARDS
        metrics_received = []

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    '%s :: cannot connect to Redis at socket path %s' %
                    (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                if settings.REDIS_PASSWORD:
                    # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH,
                        charset='utf-8',
                        decode_responses=True)
                else:
                    # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH,
                        charset='utf-8',
                        decode_responses=True)
                pipe = self.redis_conn.pipeline()
                continue

            # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
            if not skip_metrics_dict:
                try:
                    skip_metrics_dict = self.redis_conn_decoded.hgetall(
                        'horizon.skip_metrics')
                    logger.info(
                        '%s :: got %s metrics from horizon.skip_metrics from Redis hash as not set'
                        % (skyline_app, str(len(skip_metrics_dict))))
                except Exception as err:
                    logger.error(
                        '%s :: error on hgetall horizon.skip_metrics: %s' %
                        (skyline_app, str(err)))
            if not do_not_skip_metrics_dict:
                try:
                    do_not_skip_metrics_dict = self.redis_conn_decoded.hgetall(
                        'horizon.do_not_skip_metrics')
                    logger.info(
                        '%s :: got %s metrics from horizon.do_not_skip_metrics from Redis hash as not set'
                        % (skyline_app, str(len(do_not_skip_metrics_dict))))
                except Exception as err:
                    logger.error(
                        '%s :: error on hgetall horizon.do_not_skip_metrics: %s'
                        % (skyline_app, str(err)))

            try:
                # Get a chunk from the queue with a 15 second timeout
                chunk = self.q.get(True, 15)
                # @modified 20170317 - Feature #1978: worker - DO_NOT_SKIP_LIST
                # now = time()
                now = int(time())

                for metric in chunk:

                    # @added 20201120 - Feature #3820: HORIZON_SHARDS
                    try:
                        metrics_received.append(metric[0])
                    except:
                        pass

                    # @added 20201103 - Feature #3820: HORIZON_SHARDS
                    # If a metric does not map to the HORIZON_SHARD, drop it
                    # and continue
                    if HORIZON_SHARDS and number_of_horizon_shards:
                        if not self.in_shard(metric[0]):
                            horizon_shard_dropped_metrics.append(metric[0])
                            continue
                        else:
                            horizon_shard_assigned_metrics.append(metric[0])

                    # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                    found_in_do_not_skip_dict = False
                    if do_not_skip_metrics_dict:
                        try:
                            found_in_do_not_skip_dict = do_not_skip_metrics_dict[
                                metric[0]]
                        except KeyError:
                            found_in_do_not_skip_dict = False
                        if found_in_do_not_skip_dict:
                            do_not_skip_metrics_list.append(metric[0])
                    found_in_skip_dict = False
                    if not found_in_do_not_skip_dict and skip_metrics_dict:
                        try:
                            found_in_skip_dict = skip_metrics_dict[metric[0]]
                        except KeyError:
                            found_in_skip_dict = False
                        if found_in_skip_dict:
                            skip_metrics_list.append(metric[0])
                            continue

                    # @modified 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                    if not found_in_do_not_skip_dict:
                        checked_for_skip_count += 1

                        # Check if we should skip it
                        if self.in_skip_list(metric[0]):

                            # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                            skip_metrics_list.append(metric[0])

                            continue

                    # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                    if not found_in_do_not_skip_dict:
                        do_not_skip_metrics_list.append(metric[0])

                    # Bad data coming in
                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # if metric[1][0] < now - MAX_RESOLUTION:
                    #     continue
                    try:
                        if int(metric[1][0]) < now - MAX_RESOLUTION:
                            continue
                    except:
                        pass

                    # Append to messagepack main namespace
                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # key = ''.join((FULL_NAMESPACE, metric[0]))
                    key = ''.join((FULL_NAMESPACE, str(metric[0])))
                    if LOCAL_DEBUG:
                        logger.info(
                            'debug :: worker :: adding metric Redis key - %s' %
                            str(key))

                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # pipe.append(key, packb(metric[1]))
                    # pipe.sadd(full_uniques, key)
                    try:
                        pipe.append(str(key), packb(metric[1]))
                        # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
                        datapoints_sent_to_redis += 1
                    except Exception as e:
                        logger.error('%s :: error on pipe.append: %s' %
                                     (skyline_app, str(e)))
                    try:
                        # pipe.sadd(full_uniques, key)
                        pipe.sadd(full_uniques, str(key))
                    except Exception as e:
                        logger.error('%s :: error on pipe.sadd: %s' %
                                     (skyline_app, str(e)))

                    if not self.skip_mini:
                        # Append to mini namespace
                        # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                        #                      Branch #3262: py3
                        #                      Bug #3266: py3 Redis binary objects not strings
                        # mini_key = ''.join((MINI_NAMESPACE, metric[0]))
                        mini_key = ''.join((MINI_NAMESPACE, str(metric[0])))
                        pipe.append(mini_key, packb(metric[1]))
                        pipe.sadd(mini_uniques, mini_key)

                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # pipe.execute()
                    try:
                        pipe.execute()
                    except Exception as e:
                        logger.error('%s :: error on pipe.execute: %s' %
                                     (skyline_app, str(e)))
            except Empty:
                logger.info('%s :: worker queue is empty and timed out' %
                            skyline_app)
            except WatchError:
                logger.error('%s :: WatchError - %s' % (skyline_app, str(key)))
            except NotImplementedError:
                pass
            except Exception as e:
                # @added 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                   Branch #3262: py3
                #                   Bug #3266: py3 Redis binary objects not strings
                # Added traceback
                logger.error(traceback.format_exc())
                logger.error('%s :: error: %s' % (skyline_app, str(e)))

            # Log progress
            if self.canary:
                # @modified 20210511
                # Do not log the queue size for every chunk
                # logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize()))

                queue_sizes.append(self.q.qsize())
                # Only send average queue metrics to graphite once per 10 seconds
                now = time()

                # @modified 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
                # last_sent_graphite = now - last_send_to_graphite
                # if last_sent_graphite > 10:
                last_logged_queue_size = now - last_log_queue_size
                if last_logged_queue_size > 10:
                    number_queue_sizes = len(queue_sizes)
                    total_of_queue_sizes = sum(queue_sizes)
                    if total_of_queue_sizes > 0:
                        average_queue_size = total_of_queue_sizes / number_queue_sizes
                    else:
                        average_queue_size = 0
                    logger.info(
                        '%s :: total queue size for the last 10 seconds - %s' %
                        (skyline_app, str(total_of_queue_sizes)))
                    logger.info(
                        '%s :: total queue values known for the last 10 seconds - %s'
                        % (skyline_app, str(number_queue_sizes)))
                    logger.info(
                        '%s :: average queue size for the last 10 seconds - %s'
                        % (skyline_app, str(average_queue_size)))
                    # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize())
                    # self.send_graphite_metric('queue_size', average_queue_size)
                    send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         average_queue_size)
                    # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
                    for size_value in queue_sizes:
                        queue_sizes_60_seconds.append(size_value)
                    # reset queue_sizes
                    queue_sizes = []
                    # last_send_to_graphite = time()
                    last_log_queue_size = time()

                # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
                # Moved from the 10 second block above to its own 60 seconds block
                last_sent_graphite = now - last_send_to_graphite
                if last_sent_graphite >= 60:
                    try:
                        number_queue_sizes = len(queue_sizes_60_seconds)
                        total_of_queue_sizes = sum(queue_sizes_60_seconds)
                        if total_of_queue_sizes > 0:
                            average_queue_size = total_of_queue_sizes / number_queue_sizes
                        else:
                            average_queue_size = 0
                        logger.info(
                            '%s :: total queue size for the last 60 seconds - %s'
                            % (skyline_app, str(total_of_queue_sizes)))
                        logger.info(
                            '%s :: total queue values known for the last 60 seconds - %s'
                            % (skyline_app, str(number_queue_sizes)))
                        logger.info(
                            '%s :: average queue size for the last 60 seconds - %s'
                            % (skyline_app, str(average_queue_size)))
                        send_metric_name = '%s.queue_size_60s_avg' % skyline_app_graphite_namespace
                        send_graphite_metric(skyline_app, send_metric_name,
                                             average_queue_size)
                        # reset queue_sizes_60_seconds and last_sent_graphite
                        queue_sizes_60_seconds = []
                        last_send_to_graphite = time()
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error('%s :: error: %s' % (skyline_app, str(e)))

                    # @added 20210520 - Branch #1444: thunder
                    # Added Redis hash keys for thunder to monitor the horizon
                    # listen state, inferred via the
                    # horizon.worker.average_queue_size, without incurring the
                    # addition of Redis to horizon/listen.  This is used by
                    # Thunder to notify on degradation of service and
                    # operational changes.
                    cache_key = 'horizon.worker.average_queue_size'
                    try:
                        self.redis_conn.hset(cache_key, 'value',
                                             int(average_queue_size))
                        self.redis_conn.hset(cache_key, 'timestamp', int(now))
                    except Exception as e:
                        logger.error(
                            'error :: horizon :: worker :: could not update the Redis %s key - %s'
                            % (cache_key, e))

            # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
            # @modified 20201017 - Feature #3788: snab_flux_load_test
            #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
            # Send for each worker
            last_datapoints_count_to_redis = now - last_datapoints_to_redis
            if last_datapoints_count_to_redis >= 60:
                logger.info(
                    '%s :: datapoints_sent_to_redis in last 60 seconds - %s' %
                    (skyline_app, str(datapoints_sent_to_redis)))
                if self.canary:
                    send_metric_name = '%s.datapoints_sent_to_redis' % (
                        skyline_app_graphite_namespace)
                else:
                    send_metric_name = '%s.datapoints_sent_to_redis_%s' % (
                        skyline_app_graphite_namespace, str(
                            self.worker_number))
                send_graphite_metric(skyline_app, send_metric_name,
                                     datapoints_sent_to_redis)
                datapoints_sent_to_redis = 0
                last_datapoints_to_redis = int(time())

                # @added 20201120 - Feature #3820: HORIZON_SHARDS
                if HORIZON_SHARDS:
                    try:
                        self.redis_conn.sadd(
                            'horizon.shards.metrics_assigned',
                            *set(horizon_shard_assigned_metrics))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            '%s :: error adding horizon.shards.metrics_assigned: %s'
                            % (skyline_app, str(e)))
                    try:
                        self.redis_conn.sadd(
                            'horizon.shard.metrics_dropped',
                            *set(horizon_shard_dropped_metrics))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            '%s :: error adding horizon.shard.metrics_dropped: %s'
                            % (skyline_app, str(e)))

                # @added 20201103 - Feature #3820: HORIZON_SHARDS
                if HORIZON_SHARDS and HORIZON_SHARD_DEBUG:
                    horizon_shard_assigned_metrics_count = len(
                        horizon_shard_assigned_metrics)
                    horizon_shard_dropped_metrics_count = len(
                        horizon_shard_dropped_metrics)
                    logger.info(
                        '%s :: %s assigned metrics to HORIZON_SHARD %s and dropped %s metrics in last 60 seconds'
                        % (skyline_app,
                           str(horizon_shard_assigned_metrics_count),
                           str(HORIZON_SHARD),
                           str(horizon_shard_dropped_metrics_count)))
                    if LOCAL_DEBUG:
                        logger.info(
                            '%s :: horizon_shard_assigned_metrics - %s' %
                            (skyline_app, str(horizon_shard_assigned_metrics)))
                        logger.info(
                            '%s :: horizon_shard_dropped_metrics - %s' %
                            (skyline_app, str(horizon_shard_dropped_metrics)))
                    horizon_shard_assigned_metrics = []
                    horizon_shard_dropped_metrics = []

                # @added 20201120 - Feature #3820: HORIZON_SHARDS
                try:
                    self.redis_conn.sadd('horizon.metrics_received',
                                         *set(metrics_received))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        '%s :: error adding horizon.metrics_received' %
                        (skyline_app))
                if self.canary:
                    try:
                        logger.info(
                            'renaming key horizon.metrics_received to aet.horizon.metrics_received'
                        )
                        self.redis_conn.rename('horizon.metrics_received',
                                               'aet.horizon.metrics_received')
                    except Exception as e:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: failed to rename Redis key horizon.metrics_received to aet.horizon.metrics_received - %s'
                            % e)

                    # @added 20210520 - Branch #1444: thunder
                    # Added Redis hash keys for thunder to monitor the horizon
                    # listen state, inferred via the
                    # horizon.worker.average_queue_size, without incurring the
                    # addition of Redis to horizon/listen.  This is used by
                    # Thunder to notify on degradation of service and
                    # operational changes.
                    cache_key = 'horizon'
                    try:
                        self.redis_conn.setex(cache_key, 120, int(now))
                    except Exception as e:
                        logger.error(
                            'error :: horizon :: worker :: could not update the Redis %s key - %s'
                            % (cache_key, e))
                    metrics_received_count = 0
                    try:
                        metrics_received_count = self.redis_conn.scard(
                            'aet.horizon.metrics_received')
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error(
                            '%s :: error running redis_conn.scard(\'aet.horizon.metrics_received\') - %s'
                            % (skyline_app, e))
                    cache_key = 'thunder.horizon.worker.metrics_received'
                    try:
                        self.redis_conn.hset(cache_key, 'value',
                                             int(metrics_received_count))
                        self.redis_conn.hset(cache_key, 'timestamp', int(now))
                    except Exception as e:
                        logger.error(
                            'error :: horizon :: worker :: could not update the Redis %s key - %s'
                            % (cache_key, e))
                    send_metric_name = '%s.metrics_received' % (
                        skyline_app_graphite_namespace)
                    send_graphite_metric(skyline_app, send_metric_name,
                                         metrics_received_count)

                metrics_received = []

                # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                if skip_metrics_list:
                    update_skip_metrics = {}
                    for metric in list(set(skip_metrics_list)):
                        update_entry = False
                        if skip_metrics_dict:
                            try:
                                # Only update skip metrics every hour to save on
                                # Redis overhead
                                last_update_timestamp_str = skip_metrics_dict[
                                    metric]
                                if last_update_timestamp_str:
                                    last_update_timestamp = int(
                                        float(last_update_timestamp_str))
                                    if (now - last_update_timestamp) >= 3600:
                                        update_entry = True
                            except KeyError:
                                update_entry = True
                            except:
                                update_entry = True
                        else:
                            update_entry = True
                        if update_entry:
                            update_skip_metrics[metric] = int(now)
                    if update_skip_metrics:
                        try:
                            self.redis_conn_decoded.hset(
                                'horizon.skip_metrics',
                                mapping=update_skip_metrics)
                        except Exception as err:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to update horizon.skip_metrics Redis hash - %s'
                                % (err))
                if do_not_skip_metrics_list:
                    update_metrics = {}
                    for metric in list(set(do_not_skip_metrics_list)):
                        update_entry = False
                        if do_not_skip_metrics_dict:
                            try:
                                # Only update do-not-skip metrics every hour
                                # to save on Redis overhead
                                last_update_timestamp_str = do_not_skip_metrics_dict[
                                    metric]
                                if last_update_timestamp_str:
                                    last_update_timestamp = int(
                                        float(last_update_timestamp_str))
                                    if (now - last_update_timestamp) >= 3600:
                                        update_entry = True
                            except KeyError:
                                update_entry = True
                            except:
                                update_entry = True
                        else:
                            update_entry = True
                        if update_entry:
                            update_metrics[metric] = int(now)
                    if update_metrics:
                        try:
                            self.redis_conn_decoded.hset(
                                'horizon.do_not_skip_metrics',
                                mapping=update_metrics)
                        except Exception as err:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to update horizon.do_not_skip_metrics Redis hash - %s'
                                % (err))
                try:
                    skip_metrics_dict = self.redis_conn_decoded.hgetall(
                        'horizon.skip_metrics')
                    logger.info(
                        '%s :: got %s metrics from horizon.skip_metrics from Redis hash on minutely run'
                        % (skyline_app, str(len(skip_metrics_dict))))
                except Exception as err:
                    logger.error(
                        '%s :: error on hgetall horizon.skip_metrics: %s' %
                        (skyline_app, str(err)))
                try:
                    do_not_skip_metrics_dict = self.redis_conn_decoded.hgetall(
                        'horizon.do_not_skip_metrics')
                    logger.info(
                        '%s :: got %s metrics from horizon.do_not_skip_metrics from Redis hash on minutely run'
                        % (skyline_app, str(len(do_not_skip_metrics_dict))))
                except Exception as err:
                    logger.error(
                        '%s :: error on hgetall horizon.do_not_skip_metrics: %s'
                        % (skyline_app, str(err)))
                logger.info('%s :: checked %s metrics through in_skip_list' %
                            (skyline_app, str(checked_for_skip_count)))
                skip_metrics_list = []
                do_not_skip_metrics_list = []
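
The HORIZON_SHARDS logic above assigns each metric to exactly one horizon instance so that each shard can drop metrics that are not its own. A minimal sketch of such an in_shard test, assuming a stable hash over the metric name; the zlib.adler32 choice and the shard map are illustrative assumptions, not Skyline's confirmed implementation:

    from zlib import adler32

    HORIZON_SHARDS = {'horizon-1': 0, 'horizon-2': 1, 'horizon-3': 2}
    number_of_horizon_shards = len(HORIZON_SHARDS)
    HORIZON_SHARD = HORIZON_SHARDS['horizon-1']  # this instance's shard

    def in_shard(metric_name):
        # The same metric always hashes to the same shard on every instance
        return adler32(metric_name.encode()) % number_of_horizon_shards == HORIZON_SHARD
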
Example #5
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        # In Vista the log management is handled by the fetcher; the worker
        # just waits for the fetcher to do the log management
        now = int(time())
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = int(time())
            else:
                now = log_wait_for + 1

        logger.info('worker :: starting log management')
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: worker :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('worker :: log lock file removed')
            except OSError:
                logger.error(
                    'error :: worker :: failed to remove %s, continuing' %
                    skyline_app_loglock)
                pass
        else:
            logger.info('worker :: bin/%s.d log management done' % skyline_app)

        logger.info('worker :: starting worker')

        try:
            VISTA_ENABLED = settings.VISTA_ENABLED
            logger.info('worker :: VISTA_ENABLED is set to %s' %
                        str(VISTA_ENABLED))
        except:
            VISTA_ENABLED = False
            logger.info(
                'worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False'
            )

        last_sent_to_graphite = int(time())
        metrics_sent_to_flux = 0

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                    if LOCAL_DEBUG:
                        logger.info('worker :: redis is up')
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metrics_data = []
            redis_set = 'vista.fetcher.metrics.json'
            try:
                # Get a metric to validate from the Redis set

                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # metrics_data = self.redis_conn.smembers(redis_set)
                metrics_data = self.redis_conn_decoded.smembers(redis_set)

                if LOCAL_DEBUG:
                    logger.info('worker :: got redis set data - %s' %
                                redis_set)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: retrieving Redis set %s data' %
                    str(redis_set))

            if not metrics_data:
                if LOCAL_DEBUG:
                    logger.info('worker :: no data from Redis set %s' %
                                str(redis_set))
                sleep(5)

            for str_metric_data in metrics_data:
                delete_set_record = False
                remote_host_type = None
                try:

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Rather using get_redis_conn_decoded
                    # if python_version == 3:
                    #     str_metric_data = str_metric_data.decode('UTF-8')

                    metric_data = literal_eval(str_metric_data)
                    remote_host_type = str(metric_data[0]['remote_host_type'])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for remote_host_type %s'
                            % str(remote_host_type))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine remote_host_type from %s'
                        % str(str_metric_data))
                    delete_set_record = True
                if not delete_set_record:
                    try:
                        remote_target = str(metric_data[0]['remote_target'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for target %s'
                                % str(remote_target))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine target from %s'
                            % str(str_metric_data))
                        delete_set_record = True
                metric = None
                if not delete_set_record:
                    try:
                        metric = str(metric_data[0]['metric'])
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for metric %s'
                                % str(metric))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine metric from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                namespace_prefix = ''
                if not delete_set_record:
                    try:
                        namespace_prefix = str(
                            metric_data[0]['namespace_prefix'])
                        # Only append the trailing dot when a prefix is
                        # actually declared
                        if namespace_prefix and namespace_prefix != 'None':
                            namespace_prefix = '%s.' % namespace_prefix
                        else:
                            namespace_prefix = ''
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got data from Redis set for namespace_prefix %s'
                                % str(namespace_prefix))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine namespace_prefix from %s'
                            % str(str_metric_data))
                        delete_set_record = True

                have_data = False
                # timeseries is initialised here so that the validity checks
                # after datapoint processing always reference a defined list,
                # even when the set record is marked for deletion early
                timeseries = []
                if not delete_set_record:
                    last_flux_metric_data = None
                    cache_key = 'flux.last.%s' % (metric)
                    try:
                        if python_version == 3:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key).decode('UTF-8')
                        else:
                            redis_last_flux_metric_data = self.redis_conn.get(
                                cache_key)
                        last_flux_metric_data = literal_eval(
                            redis_last_flux_metric_data)
                        if LOCAL_DEBUG:
                            logger.info(
                                'worker :: got last_flux_metric_data from Redis'
                            )
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: retrieving Redis key %s data' %
                            str(cache_key))
                        last_flux_metric_data = False

                    last_flux_timestamp = None
                    if last_flux_metric_data:
                        try:
                            last_flux_timestamp = int(last_flux_metric_data[0])
                            if LOCAL_DEBUG:
                                logger.info(
                                    'worker :: got last_flux_timestamp - %s' %
                                    str(last_flux_timestamp))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed determining last_flux_timestamp'
                            )
                            last_flux_timestamp = False

                    # Determine the timestamp of the current minute to apply
                    # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                    time_now = int(time())
                    # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M')
                    current_minute_hour = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%H'))
                    current_minute_minute = int(
                        datetime.datetime.utcfromtimestamp(time_now).strftime(
                            '%M'))
                    current_datetime = datetime.datetime.utcfromtimestamp(
                        time_now).replace(hour=current_minute_hour,
                                          minute=current_minute_minute,
                                          second=0,
                                          microsecond=0)
                    current_minute_timestamp_start = int(
                        current_datetime.strftime('%s'))
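                    # Note: strftime('%s') relies on a platform specific
                    # extension, calendar.timegm(current_datetime.timetuple())
                    # would be a portable equivalent for this UTC datetime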

                    datapoint = None
                    last_timestamp_with_data = None
                    timeseries = []

                    # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                    metric_resolution = 60
                    metric_resolution_determined = False

                    try:
                        # The py2 and py3 handling is identical here, the
                        # datapoints element is a string representation that
                        # needs two literal_eval passes to become a list
                        datapoints_str = literal_eval(
                            metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                        # for value, timestamp in metric_data[0]['datapoints']:
                        if LOCAL_DEBUG:
                            len_metric_datapoints = len(metric_datapoints)
                            logger.info(
                                'worker :: got %s metric_datapoints - %s' %
                                (str(len_metric_datapoints),
                                 str(metric_datapoints)))

                        # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                        # Determine resolution
                        resolution_timestamps = []
                        for metric_datapoint in metric_datapoints:
                            timestamp = int(metric_datapoint[0])
                            resolution_timestamps.append(timestamp)
                        timestamp_resolutions = []
                        if resolution_timestamps:
                            last_timestamp = None
                            for timestamp in resolution_timestamps:
                                if last_timestamp:
                                    resolution = timestamp - last_timestamp
                                    timestamp_resolutions.append(resolution)
                                    last_timestamp = timestamp
                                else:
                                    last_timestamp = timestamp
                        if timestamp_resolutions:
                            try:
                                timestamp_resolutions_count = Counter(
                                    timestamp_resolutions)
                                ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                                )
                                metric_resolution = int(
                                    ordered_timestamp_resolutions_count[0][0])
                                if metric_resolution > 0:
                                    metric_resolution_determined = True
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine metric_resolution from %s'
                                    % (str(metric_data)))
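                        # e.g. Counter([60, 60, 120, 60]).most_common() returns
                        # [(60, 3), (120, 1)], so the most frequently occurring
                        # interval, 60, becomes the metric resolution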
                        if metric_resolution_determined:
                            cache_key = 'vista.last.resolution.%s' % metric
                            try:
                                # Update Redis key
                                self.redis_conn.setex(cache_key, 3600,
                                                      metric_resolution)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: fetcher :: failed to set Redis key - %s'
                                    % (cache_key))

                        for metric_datapoint in metric_datapoints:
                            # @20191010 - Branch #3140: vista
                            # fetcher passes through preformatted data points that
                            # are in the same format/order for both graphite and
                            # prometheus
                            # if remote_host_type == 'graphite':
                            #     value = float(metric_datapoint[0])
                            #     timestamp = int(metric_datapoint[1])
                            # if remote_host_type == 'prometheus':
                            #     value = float(metric_datapoint[1])
                            #     timestamp = int(metric_datapoint[0])
                            timestamp = int(metric_datapoint[0])
                            value = float(metric_datapoint[1])

                            append_to_timeseries = False
                            if last_flux_timestamp:
                                if int(timestamp) > last_flux_timestamp:
                                    # timeseries.append([timestamp, value])
                                    append_to_timeseries = True
                            else:
                                # timeseries.append([timestamp, value])
                                append_to_timeseries = True

                            # Here if the timestamp of the data point falls
                            # within the current minute, it is discarded and not
                            # sent to flux, to ensure that high frequency metrics
                            # can have their minutely bins fully populated before
                            # they are submitted to Graphite
                            if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                                if int(timestamp) >= current_minute_timestamp_start:
                                    append_to_timeseries = False
                            if append_to_timeseries:
                                timeseries.append([timestamp, value])
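                            # e.g. with the current minute starting at 12:00:00
                            # a data point timestamped 12:00:30 is held back
                            # and picked up on a later run once its minutely
                            # bin is complete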

                        last_timestamp_with_data = 0
                        for timestamp, value in timeseries[::-1]:
                            # 0 and 0.0 are valid values, only None counts as
                            # no value
                            if value is not None:
                                last_timestamp_with_data = int(timestamp)
                                datapoint = value
                                break
                        if last_timestamp_with_data:
                            have_data = True
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine datapoints from %s'
                            % (str(metric_data)))
                        delete_set_record = True
                if not delete_set_record and not timeseries:
                    logger.info(
                        'worker :: after processing, there were no valid data points in %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if not have_data and timeseries:
                    logger.error(
                        'error :: worker :: failed to determine last_timestamp_with_data from %s'
                        % (str(metric_data)))
                    delete_set_record = True
                if delete_set_record:
                    try:
                        redis_set = 'vista.fetcher.metrics.json'
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (str(redis_set), str(str_metric_data)))
                    continue

                if not metric:
                    continue

                valid_data = True
                if last_flux_timestamp and last_timestamp_with_data:
                    if int(last_timestamp_with_data) <= last_flux_timestamp:
                        valid_data = False
                if not valid_data:
                    redis_set = 'vista.fetcher.metrics.json'
                    logger.info(
                        'worker :: no valid data in fetched data, removing from Redis set %s - data - %s'
                        % (redis_set, str(str_metric_data)))
                    try:
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s'
                            % (redis_set, str(str_metric_data)))
                    continue

                if valid_data:
                    flux_host = 'http://%s:%s' % (settings.FLUX_IP,
                                                  settings.FLUX_PORT)

                    # Resample if a resample period is configured. No
                    # resample_at is set for this worker, so this evaluates to
                    # False and the time series is submitted as-is
                    resample_at = None
                    if resample_at in (None, 'none', 'None', '0Min', '0min', '0', 0):
                        resample_at = False
                    if resample_at:
                        try:
                            df = pd.DataFrame(timeseries)
                            df.columns = ['timestamp', 'value']
                            df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                             unit='s',
                                                             origin='unix')
                            df = df.set_index('timestamp')
                            resampled_df = df.resample(resample_at).sum()
                            resampled_timeseries = []
                            for index, row in resampled_df.iterrows():
                                timestamp = int(index.strftime('%s'))
                                resampled_timeseries.append(
                                    [timestamp, row[0]])
                            timeseries = resampled_timeseries
                            timeseries_length = len(timeseries)
                            logger.info(
                                'worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                                % (str(resample_at), str(timeseries_length)))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to resample time series at %s for %s with time series %s'
                                % (str(resample_at), str(metric),
                                   str(timeseries)))

                    success = False
                    response = None
                    for timestamp, value in timeseries:
                        # Submit this timestamp's value to flux
                        flux_url = '%s/metric_data?metric=%s&value=%s&timestamp=%s&key=%s' % (
                            flux_host, metric, str(value), str(timestamp),
                            settings.FLUX_SELF_API_KEY)
                        success = False
                        try:
                            response = requests.get(flux_url)
                            if response.status_code == 200:
                                success = True
                            elif response.status_code == 204:
                                success = True
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to request %s' %
                                str(flux_url))
                        if not success and response is not None:
                            logger.error(
                                'error :: worker :: http status code - %s, reason - %s'
                                % (str(response.status_code),
                                   str(response.reason)))

                    if success:
                        metrics_sent_to_flux += 1
                        redis_set = 'vista.fetcher.metrics.json'

                        # @added 20191011 - Task #3258: Reduce vista logging
                        timeseries_length = len(timeseries)

                        # @modified 20191011 - Task #3258: Reduce vista logging
                        # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % (
                        #     redis_set))
                        logger.info(
                            'worker :: %s data points submitted to flux OK for %s'
                            % (str(timeseries_length), metric))
                        try:
                            self.redis_conn.srem(redis_set, str_metric_data)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to delete data from Redis set %s, data - %s'
                                % (redis_set, str(str_metric_data)))

                        redis_set = 'vista.fetcher.unique_metrics'
                        try:
                            self.redis_conn.sadd(redis_set, remote_target)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to add %s to Redis set %s'
                                % (remote_target, redis_set))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics sent_to_flux in last 60 seconds - %s' %
                    str(metrics_sent_to_flux))
                send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(parent_skyline_app, send_metric_name,
                                         str(metrics_sent_to_flux))
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_flux = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (send_metric_name, str(metrics_sent_to_flux)))
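
The worker above determines each metric's resolution as the most common
interval between consecutive timestamps. A minimal standalone sketch of that
technique, assuming timestamps are epoch seconds in ascending order (the
function is illustrative and not part of Skyline):

    from collections import Counter

    def determine_resolution(timestamps, default=60):
        """Return the modal interval between consecutive timestamps."""
        intervals = [b - a for a, b in zip(timestamps, timestamps[1:])]
        if not intervals:
            return default
        resolution, _count = Counter(intervals).most_common(1)[0]
        return resolution if resolution > 0 else default

    # determine_resolution([0, 60, 120, 240, 300]) returns 60
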
Exemple #6
0
    def run(self):
        """
        Called when the process initializes.
        """
        def pickle_data_to_graphite(data):

            message = None
            try:
                payload = pickle.dumps(data, protocol=2)
                header = struct.pack("!L", len(payload))
                message = header + payload
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle to send to Graphite'
                )
                return False
            if message:
                try:
                    sock = socket.socket()
                    sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT))
                    sock.sendall(message)
                    sock.close()
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to send pickle data to Graphite'
                    )
                    return False
            else:
                logger.error(
                    'error :: populate_metric_worker :: failed to pickle metric data into message'
                )
                return False
            return True
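
        # The Graphite pickle protocol frames each message as a 4 byte
        # big-endian unsigned length header ("!L") followed by the pickled
        # payload, e.g. for a single illustrative datapoint:
        #   payload = pickle.dumps(
        #       [('some.metric', (1556817000, 7.3))], protocol=2)
        #   message = struct.pack("!L", len(payload)) + payload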

        logger.info('populate_metric_worker :: starting worker')

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'populate_metric_worker :: cannot connect to Redis at socket path %s'
                        % (settings.REDIS_SOCKET_PATH))
                    sleep(2)

                    # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    # @added 20191128 - Bug #3266: py3 Redis binary objects not strings
                    #                   Branch #3262: py3
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            metricDict = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # item on the queue is a metricDict describing the remote
                # metric to populate
                metricDict = self.q.get(True, 1)
                logger.info('populate_metric_worker :: processing queue item')
            except Empty:
                logger.info(
                    'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds'
                )
                sleep(30)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info(
                    'populate_metric_worker :: server was interrupted - SystemExit'
                )
            except Exception as e:
                logger.error('error :: populate_metric_worker :: %s' %
                             (str(e)))

            if not metricDict:
                continue

            try:
                remote_host_type = str(metricDict['remote_host_type'])
                remote_target = str(metricDict['remote_target'])
                metric = str(metricDict['metric'])
                namespace_prefix = str(metricDict['namespace_prefix'])
                if not namespace_prefix:
                    namespace_prefix = ''
                if namespace_prefix == 'None':
                    namespace_prefix = ''
                key = str(metricDict['key'])
                token = str(metricDict['token'])
                user = str(metricDict['user'])
                password = str(metricDict['password'])
                if metricDict['fetch_resolution_urls'] == 'None':
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue
                if metricDict['fetch_resolution_urls'] == '()' or metricDict[
                        'fetch_resolution_urls'] == ():
                    logger.info(
                        'No fetch_resolution_urls declared for %s, nothing to do'
                        % remote_target)
                    continue

                fetch_resolution_urls_str = literal_eval(
                    metricDict['fetch_resolution_urls'])
                fetch_resolution_urls = literal_eval(fetch_resolution_urls_str)
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to read from metricData'
                )
                continue

            if LOCAL_DEBUG:
                try:
                    logger.info(
                        'populate_metric_worker :: remote_target from metricData set to %s'
                        % remote_target)
                    logger.info(
                        'populate_metric_worker :: metric from metricData set to %s'
                        % metric)
                    logger.info(
                        'populate_metric_worker :: namespace_prefix from metricData set to %s'
                        % namespace_prefix)
                    logger.info(
                        'populate_metric_worker :: key from metricData set to %s'
                        % key)
                    logger.info(
                        'populate_metric_worker :: token from metricData set to %s'
                        % token)
                    logger.info(
                        'populate_metric_worker :: user from metricData set to %s'
                        % user)
                    logger.info(
                        'populate_metric_worker :: password from metricData set to %s'
                        % password)
                    logger.info(
                        'populate_metric_worker :: fetch_resolution_urls from metricData set to %s'
                        % str(fetch_resolution_urls))
                    if fetch_resolution_urls:
                        for fetch_url in fetch_resolution_urls:
                            logger.info(
                                'populate_metric_worker :: a fetch_url from metricData is set to %s'
                                % str(fetch_url))
                    logger.info(
                        'populate_metric_worker :: metric is set to %s' %
                        metric)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to read from metricData'
                    )

            # Best effort to de-duplicate the data sent to Graphite
            cache_key = 'flux.last.%s' % metric
            last_flux_timestamp = None
            try:
                # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8')
                redis_last_metric_data = self.redis_conn_decoded.get(cache_key)
                last_metric_data = literal_eval(redis_last_metric_data)
                last_flux_timestamp = int(last_metric_data[0])
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s'
                    % cache_key)
                last_flux_timestamp = False
            recent_last_flux_timestamp_present = False
            if last_flux_timestamp:
                now = int(time())
                if (now - last_flux_timestamp) < 600:
                    recent_last_flux_timestamp_present = True
                    # Skyline has the metric so adding it to the vista.fetcher
                    # Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s'
                            % (metric, redis_set))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))
                    # continue

            if not last_flux_timestamp:
                # Check Graphite does not have the data or determine what the
                # last data Graphite has is
                logger.info(
                    'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data'
                    % (metric))
                check_graphite_from = [
                    '-50mins', '-6hours', '-24hours', '-7days', '-30days',
                    '-90days'
                ]
                timeseries = []
                for graphite_from in check_graphite_from:
                    if last_flux_timestamp:
                        break
                    logger.info(
                        'populate_metric_worker :: checking %s in Graphite from %s'
                        % (metric, graphite_from))
                    got_data = False
                    try:
                        # Query the Graphite render API over increasingly
                        # longer windows until the metric returns data
                        if settings.GRAPHITE_PORT != '':
                            url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                str(settings.GRAPHITE_PORT),
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        else:
                            url = '%s://%s/%s/?from=%s&target=%s&format=json' % (
                                settings.GRAPHITE_PROTOCOL,
                                settings.GRAPHITE_HOST,
                                settings.GRAPHITE_RENDER_URI, graphite_from,
                                metric)
                        logger.info(
                            'populate_metric_worker :: using Graphite URL - %s'
                            % (url))
                        r = requests.get(url)
                        if r.status_code == 200:
                            js = []
                            try:
                                js = r.json()
                            except:
                                logger.info(traceback.format_exc())
                                logger.error(
                                    'error :: populate_metric_worker :: failed to get data from Graphite'
                                )
                                continue
                            if not js:
                                logger.info(
                                    'populate_metric_worker :: %s not present in Graphite from %s'
                                    % (metric, graphite_from))
                                continue
                            got_data = True
                            logger.info(
                                'populate_metric_worker :: %s present in Graphite from %s'
                                % (metric, graphite_from))
                        else:
                            logger.info(
                                'populate_metric_worker :: %s not present in Graphite from %s'
                                % (metric, graphite_from))
                            continue
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to get data from Graphite'
                        )
                        continue

                    datapoints = []
                    if got_data:
                        try:
                            js = r.json()
                            datapoints = js[0]['datapoints']
                            logger.info(
                                'populate_metric_worker :: %s data points are present in the Graphite %s data'
                                % (str(len(datapoints)), str(graphite_from)))
                        except:
                            logger.info(traceback.format_exc())
                            logger.error(
                                'error :: populate_metric_worker :: failed to get data from Graphite'
                            )

                    for datapoint in datapoints:
                        try:
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                        except:  # nosec
                            continue
                    last_timestamp_with_data = None
                    for timestamp, value in timeseries[::-1]:
                        # 0 and 0.0 are valid values, only None counts as no
                        # value
                        if value is not None:
                            last_timestamp_with_data = int(timestamp)
                            datapoint = value
                            break
                    if last_timestamp_with_data:
                        # Here we set this as the missing last_flux_timestamp
                        last_flux_timestamp = last_timestamp_with_data
                        recent_last_flux_timestamp_present = True
                        logger.info(
                            'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp'
                            % (metric, str(graphite_from),
                               str(last_flux_timestamp)))

            timeseries = []
            start_populating = int(time())
            datapoints_added_to_timeseries = 0
            datapoints_already_populated = 0
            datapoints_with_no_value = 0
            timestamp = None
            value = None

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # Also set the flux.last key if the value returned from the remote
            # is null, so that time series which are mostly null do not keep
            # being added to flux populate_metric by Vista
            raw_timeseries = []

            for fetch_url in fetch_resolution_urls:
                # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
                # This was for the Prometheus query API and resample method,
                # not for the query_range API query
                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000':
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )

                if recent_last_flux_timestamp_present and remote_host_type == 'prometheus':
                    try:
                        logger.info(
                            'populate_metric_worker :: recent data so replacing fetch_url %s '
                            % (fetch_url))
                        seconds_to_fetch = int(time()) - last_flux_timestamp
                        minutes_to_fetch = int(seconds_to_fetch / 60) + 2
                        re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch)
                        fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch,
                                           fetch_url)
                        encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str(
                            minutes_to_fetch)
                        fetch_url = re.sub(r'%5B.*%5D',
                                           encoded_re_mins_to_fetch, fetch_url)
                        logger.info(
                            'populate_metric_worker :: replaced fetch_url %s '
                            % (fetch_url))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to rewrite URL'
                        )
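                # e.g. a fetch_url containing 'up[4320m]' is rewritten to
                # 'up[12m]' when the last flux timestamp is about 10 minutes
                # old, so only the unseen window is fetched again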

                success = False
                response = None
                try:
                    logger.info(
                        'populate_metric_worker :: getting data from %s' %
                        str(fetch_url))
                    response = requests.get(fetch_url)
                    if response.status_code == 200:
                        success = True
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))
                if not success and response is not None:
                    logger.error(
                        'error :: populate_metric_worker :: http status code - %s, reason - %s'
                        % (str(response.status_code), str(response.reason)))

                if not success:
                    continue

                datapoints = None
                try:
                    js = response.json()
                    if remote_host_type == 'graphite':
                        datapoints = js[0]['datapoints']
                    if remote_host_type == 'prometheus':
                        datapoints = js['data']['result'][0]['values']
                    datapoints_fetched = len(datapoints)
                    logger.info(
                        'populate_metric_worker :: retrieved %s data points from %s'
                        % (str(datapoints_fetched), str(fetch_url)))
                except:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to get data from %s'
                        % str(fetch_url))

                # Example
                # datapoints[0]
                # [7.3, 1556817000]
                # Add each data point and timestamp to the timeseries list so
                # they can be sent to Graphite
                if not datapoints:
                    logger.info(
                        'populate_metric_worker :: failed to get any data from %s'
                        % str(fetch_url))
                    continue

                # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                valid_datapoints = []
                for datapoint in datapoints:
                    value = None
                    timestamp = None
                    if remote_host_type == 'graphite':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[1], datapoint[0]])

                        try:
                            raw_value = datapoint[0]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            value = float(datapoint[0])
                            timestamp = int(datapoint[1])
                            valid_datapoints.append([value, timestamp])
                        except:
                            continue
                    if remote_host_type == 'prometheus':
                        # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
                        raw_timeseries.append([datapoint[0], datapoint[1]])

                        try:
                            raw_value = datapoint[1]
                            if raw_value is None:
                                datapoints_with_no_value += 1
                                continue
                            timestamp = int(datapoint[0])
                            value = float(datapoint[1])
                        except:
                            continue
                        valid_datapoints.append([timestamp, value])
                datapoints = valid_datapoints

                # Order the time series by timestamp as the tuple can shift
                # order resulting in more recent data being added before older
                # data. Note the graphite datapoints here are [value, timestamp]
                # while the prometheus datapoints are [timestamp, value]
                if remote_host_type == 'graphite':
                    datapoints.sort(key=lambda x: x[1])
                else:
                    datapoints.sort()

                # Determine the timestamp of the current minute to apply
                # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                time_now = int(time())
                current_minute_hour = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%H'))
                current_minute_minute = int(
                    datetime.datetime.utcfromtimestamp(time_now).strftime(
                        '%M'))
                current_datetime = datetime.datetime.utcfromtimestamp(
                    time_now).replace(hour=current_minute_hour,
                                      minute=current_minute_minute,
                                      second=0,
                                      microsecond=0)
                current_minute_timestamp_start = int(
                    current_datetime.strftime('%s'))
                datapoints_in_current_minute = 0

                last_error = None
                value = None
                timestamp = None
                for datapoint in datapoints:
                    try:
                        if remote_host_type == 'graphite':
                            try:
                                raw_value = datapoint[0]
                                if raw_value is None:
                                    continue
                                value = float(datapoint[0])
                                timestamp = int(datapoint[1])
                            except:
                                continue
                        if remote_host_type == 'prometheus':
                            # timestamp = int(datapoint[0])
                            try:
                                timestamp = int(datapoint[0])
                                value = float(datapoint[1])
                            except:
                                continue
                        submit_data = True
                        if last_flux_timestamp:
                            if timestamp <= last_flux_timestamp:
                                submit_data = False
                                datapoints_already_populated += 1

                        # Here if the timestamp of the data point falls
                        # within the current minute, it is discarded and not
                        # sent to flux, to ensure that high frequency metrics
                        # can have their minutely bins fully populated before
                        # they are submitted to Graphite
                        if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                            if timestamp >= current_minute_timestamp_start:
                                submit_data = False
                                datapoints_in_current_minute += 1
                        if submit_data:
                            new_datapoint = [timestamp, value]
                            timeseries.append(new_datapoint)
                            datapoints_added_to_timeseries += 1
                    # nosec to exclude from bandit tests
                    except:  # nosec
                        last_error = traceback.format_exc()
                        datapoints_with_no_value += 1
                        continue

                if last_error:
                    logger.error(last_error)
                    logger.error(
                        'error :: populate_metric_worker :: the above is the last_error encountered processing %s'
                        % (str(metric)))
                if datapoints_with_no_value:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they had value None'
                        % (str(datapoints_with_no_value)))
                if datapoints_in_current_minute:
                    logger.info(
                        'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute'
                        % (str(datapoints_in_current_minute)))
                logger.info(
                    'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp'
                    % (str(datapoints_already_populated)))
                logger.info(
                    'populate_metric_worker :: added %s data points to the time series to submit to Graphite'
                    % (str(datapoints_added_to_timeseries)))

            end_fetching = int(time())
            seconds_to_fetch = end_fetching - start_populating
            if timestamp:
                logger.info(
                    'populate_metric_worker :: last fetched value - %s, timestamp %s'
                    % (str(value), str(timestamp)))
            logger.info(
                'populate_metric_worker :: %s data points fetched for %s in %s seconds'
                % (str(datapoints_added_to_timeseries), remote_target,
                   str(seconds_to_fetch)))

            # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints
            # Also set the flux.last key if the value returned from the remote
            # is null, so that time series which are mostly null do not keep
            # being added to flux populate_metric by Vista
            if not timeseries:
                set_flux_key = False
                try:
                    sorted_raw_timeseries = sorted(raw_timeseries,
                                                   key=lambda x: x[0])
                    last_ts = sorted_raw_timeseries[-1][0]
                    if int(last_ts) > (end_fetching - 120):
                        if sorted_raw_timeseries[-1][1] is None:
                            set_flux_key = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to determine if last value was null'
                    )
                if set_flux_key:
                    try:
                        # Update Redis flux key
                        cache_key = 'flux.last.%s' % metric
                        metric_data = [int(last_ts), None]
                        self.redis_conn.set(cache_key, str(metric_data))
                        logger.info(
                            'populate_metric_worker :: even though no data points so as to not loop round on this metric, set the metric Redis key - %s - %s'
                            % (cache_key, str(metric_data)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s'
                            % (cache_key, str(metric_data)))
                    # Adding to the vista.fetcher.unique_metrics Redis set
                    redis_set = 'vista.fetcher.unique_metrics'
                    data = str(remote_target)
                    try:
                        self.redis_conn.sadd(redis_set, data)
                        logger.info(
                            'populate_metric_worker :: even though no data points, added %s to Redis set %s'
                            % (remote_target, redis_set))
                    except:
                        logger.info(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s'
                            % (str(data), str(redis_set)))

            if not timeseries:
                logger.info(
                    'populate_metric_worker :: no data in the timeseries list for the time series for %s'
                    % metric)
                continue

            # Order the time series by timestamp as the tuple can shift
            # order resulting in more recent data being added before older
            # data
            timeseries.sort()
            timeseries_length = len(timeseries)

            # Resample
            resample_at = '1Min'
            if resample_at:
                try:
                    df = pd.DataFrame(timeseries)
                    df.columns = ['timestamp', 'value']
                    df['timestamp'] = pd.to_datetime(df['timestamp'],
                                                     unit='s',
                                                     origin='unix')
                    df = df.set_index('timestamp')
                    # resampled_df = df.resample(resample_at).sum()
                    # Use the mean as Prometheus uses the average in the
                    # query_range API method
                    resampled_df = df.resample(resample_at).mean()
                    resampled_timeseries = []
                    for index, row in resampled_df.iterrows():
                        timestamp = int(index.strftime('%s'))
                        resampled_timeseries.append([timestamp, row[0]])
                    timeseries = resampled_timeseries
                    timeseries_length = len(timeseries)
                    logger.info(
                        'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite'
                        % (str(resample_at), str(timeseries_length)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to resample time series for %s'
                        % str(metric))
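            # e.g. resampling [[0, 2.0], [30, 4.0], [60, 6.0]] at 1Min with
            # mean() yields [[0, 3.0], [60, 6.0]] as the two data points in
            # the first minutely bin are averaged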

            logger.info(
                'populate_metric_worker :: %s data points to send to Graphite'
                % (str(timeseries_length)))
            timestamp = None
            value = None
            sent_to_graphite = 0

            # use_pickle = False
            use_pickle = True
            if not use_pickle:
                for timestamp, value in timeseries:
                    try:
                        graphyte.send(metric, float(value), int(timestamp))
                        sent_to_graphite += 1
                        if sent_to_graphite % 1000 == 0:
                            logger.info(
                                'populate_metric_worker :: submitted %s of %s data points to Graphite so far'
                                % (str(sent_to_graphite),
                                   str(timeseries_length)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                            % str(metric))
            else:
                listOfMetricTuples = []
                try:
                    for timestamp, value in timeseries:
                        tuple_data = (metric, (int(timestamp), float(value)))
                        listOfMetricTuples.append(tuple_data)
                        sent_to_graphite += 1
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s'
                        % str(metric))
                if listOfMetricTuples:
                    data_points_sent = 0
                    smallListOfMetricTuples = []
                    tuples_added = 0
                    for data in listOfMetricTuples:
                        smallListOfMetricTuples.append(data)
                        tuples_added += 1
                        if tuples_added >= 1000:
                            pickle_data_sent = pickle_data_to_graphite(
                                smallListOfMetricTuples)
                            if pickle_data_sent:
                                data_points_sent += tuples_added
                                logger.info(
                                    'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s'
                                    %
                                    (str(tuples_added), str(data_points_sent),
                                     str(timeseries_length), metric))
                                sent_to_graphite += len(
                                    smallListOfMetricTuples)
                                smallListOfMetricTuples = []
                                tuples_added = 0
                            else:
                                logger.error(
                                    'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s'
                                    % (str(tuples_added), metric))
                    if smallListOfMetricTuples:
                        tuples_to_send = len(smallListOfMetricTuples)
                        pickle_data_sent = pickle_data_to_graphite(
                            smallListOfMetricTuples)
                        if pickle_data_sent:
                            data_points_sent += tuples_to_send
                            logger.info(
                                'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), str(data_points_sent),
                                   str(timeseries_length), metric))
                        else:
                            logger.error(
                                'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s'
                                % (str(tuples_to_send), metric))

            logger.info(
                'populate_metric_worker :: sent %s data points to Graphite for %s'
                % (str(sent_to_graphite), metric))
            try:
                skyline_metric = '%s.datapoints_sent_to_graphite' % (
                    skyline_app_graphite_namespace)
                # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                # graphyte.send(skyline_metric, float(sent_to_graphite), int(time()))
                send_graphite_metric(skyline_app, skyline_metric,
                                     float(sent_to_graphite))
                logger.info(
                    'populate_metric_worker :: submitted %s to Graphite for %s'
                    % (str(float(sent_to_graphite)), skyline_metric))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: populate_metric_worker :: failed to send metric data to Graphite for %s'
                    % str(skyline_metric))

            # 0 and 0.0 are valid values, so test against None rather than
            # relying on truthiness
            has_value = value is not None

            if timestamp and has_value:
                try:
                    # Update Redis flux key
                    cache_key = 'flux.last.%s' % metric
                    metric_data = [int(timestamp), float(value)]
                    self.redis_conn.set(cache_key, str(metric_data))
                    logger.info(
                        'populate_metric_worker :: set the metric Redis key - %s - %s'
                        % (cache_key, str(metric_data)))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to set Redis key - %s - %s'
                        % (cache_key, str(metric_data)))

                # Adding to the vista.fetcher.unique_metrics Redis set
                redis_set = 'vista.fetcher.unique_metrics'
                data = str(remote_target)
                try:
                    self.redis_conn.sadd(redis_set, data)
                    logger.info(
                        'populate_metric_worker :: added %s to Redis set %s' %
                        (remote_target, redis_set))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: populate_metric_worker :: failed to add %s to Redis set %s'
                        % (str(data), str(redis_set)))

            end_populating = int(time())
            seconds_to_run = end_populating - start_populating
            logger.info(
                'populate_metric_worker :: %s populated to Graphite in %s seconds'
                % (metric, str(seconds_to_run)))
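The resample step above can be exercised in isolation. A minimal sketch assuming only pandas; resample_timeseries is an illustrative helper name, not part of Skyline:

    import pandas as pd

    def resample_timeseries(timeseries, rule='1Min'):
        """
        Resample [timestamp, value] pairs to per-interval means, mirroring
        the populate_metric_worker resample step (mean, because Prometheus
        uses the average in its query_range API method).
        """
        df = pd.DataFrame(timeseries, columns=['timestamp', 'value'])
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s',
                                         origin='unix')
        df = df.set_index('timestamp')
        resampled_df = df.resample(rule).mean()
        return [[int(index.timestamp()), row['value']]
                for index, row in resampled_df.iterrows()]

    # Two data points in the same minute resample to their mean:
    # resample_timeseries([[1571400000, 1.0], [1571400030, 3.0]])
    # returns [[1571400000, 2.0]]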
Example #7
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Use get_redis_conn and get_redis_conn_decoded
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = get_redis_conn(skyline_app)
                    self.redis_conn_decoded = get_redis_conn_decoded(
                        skyline_app)

            if LOCAL_DEBUG:
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: debug :: flux.httpMetricDataQueue queue size - %s'
                        % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )

            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp]
                metric_data = self.q.get(True, 1)

            except Empty:
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            # @added 20200206 - Feature #3444: Allow flux to backfill
            # Added backfill
            backfill = False

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: debug :: queue item found - %s' %
                            str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s'
                        % str(metric_data))
                    continue

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check flux.last key if this is not backfill
                    if not backfill:
                        cache_key = 'flux.last.%s' % metric
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            #                      Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(
                                cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(
                                    redis_last_metric_data)
                                last_metric_timestamp = int(
                                    last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: last_metric_timestamp for %s from %s is %s'
                                        % (metric, str(cache_key),
                                           str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                    % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s'
                                        % (str(timestamp),
                                           str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        try:
                            graphyte.send(metric, value, timestamp)
                            submittedToGraphite = True
                            logger.info(
                                'worker :: sent %s, %s, %s to Graphite' %
                                (str(metric), str(value), str(timestamp)))
                            metrics_sent_to_graphite += 1
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to send metric data to Graphite for %s'
                                % str(metric))
                            metric = None
                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            if not backfill:
                                metric_data = [timestamp, value]
                                self.redis_conn.set(cache_key,
                                                    str(metric_data))
                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                # Add a flux.filled key to Redis with an expiry
                                # set to FULL_DURATION so that Analyzer knows to
                                # sort and deduplicate the Redis time series
                                # data as carbon-relay will send it to Horizon
                                # and the datapoints will be out of order in the
                                # Redis key
                                try:
                                    flux_filled_key = 'flux.filled.%s' % str(
                                        metric)
                                    self.redis_conn.setex(
                                        flux_filled_key,
                                        settings.FULL_DURATION, int(time()))
                                    logger.info('worker :: set Redis key %s' %
                                                (str(flux_filled_key)))
                                except Exception as e:
                                    logger.error(
                                        'error :: worker :: could not set Redis flux.filled key: %s'
                                        % e)
                    else:
                        logger.info(
                            'worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite'
                            % (str(metric), str(value), str(timestamp),
                               str(timestamp)))
                else:
                    logger.info(
                        'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s'
                        % (str(settings.FLUX_SEND_TO_CARBON), str(metric),
                           str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    logger.info('worker :: sent %s, %s, %s to statsd' %
                                (metric, str(value), str(timestamp)))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info(
                    'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                    % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metrics_sent_to_graphite)))
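The de-duplication logic in run() reduces to one check against the flux.last.<metric> Redis key. A minimal sketch of that pattern, assuming a decoded Redis connection (decode_responses=True); is_duplicate is an illustrative name, not a Skyline function:

    from ast import literal_eval

    def is_duplicate(redis_conn_decoded, metric, timestamp):
        """
        Return True if a data point with an equal or newer timestamp has
        already been submitted to Graphite for this metric.
        """
        cache_key = 'flux.last.%s' % metric
        redis_last_metric_data = redis_conn_decoded.get(cache_key)
        if not redis_last_metric_data:
            # No flux.last key yet, nothing has been submitted
            return False
        try:
            # The key holds a string such as '[1571400000, 1.0]'
            last_metric_data = literal_eval(redis_last_metric_data)
            last_metric_timestamp = int(last_metric_data[0])
        except (ValueError, SyntaxError, IndexError, TypeError):
            return False
        return timestamp <= last_metric_timestamp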
Example #8
    def listen_pickle(self):
        """
        Listen for pickles over tcp
        """
        while 1:
            try:
                # Set up the TCP listening socket
                s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
                s.bind((self.ip, self.port))
                s.setblocking(1)
                s.listen(5)
                logger.info('%s :: listening over tcp for pickles on %s' %
                            (skyline_app, str(self.port)))

                (conn, address) = s.accept()
                logger.info('%s :: connection from %s on %s' %
                            (skyline_app, str(address[0]), str(self.port)))

                chunk = []
                while 1:
                    self.check_if_parent_is_alive()
                    try:
                        # @modified 20191016 - Task #3278: py3 handle bytes and not str in pickles
                        #                      Branch #3262: py3
                        # Added ability to log and debug
                        if LOCAL_DEBUG:
                            length = None
                            body = None
                            try:
                                length = Struct('!I').unpack(
                                    self.read_all(conn, 4))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: listen :: could not determine length'
                                )
                            if length:
                                try:
                                    body = self.read_all(conn, length[0])
                                except:
                                    logger.error(traceback.format_exc())
                                    logger.error(
                                        'error :: listen :: could not determine body'
                                    )
                        else:
                            length = Struct('!I').unpack(self.read_all(
                                conn, 4))
                            body = self.read_all(conn, length[0])

                        # Iterate and chunk each individual datapoint
                        for bunch in self.gen_unpickle(body):
                            for metric in bunch:
                                chunk.append(metric)

                                # Queue the chunk and empty the variable
                                if len(chunk) > settings.CHUNK_SIZE:
                                    try:
                                        self.q.put(list(chunk), block=False)
                                        chunk[:] = []

                                    # Drop chunk if queue is full
                                    except Full:
                                        chunks_dropped = str(len(chunk))
                                        logger.info(
                                            '%s :: pickle queue is full, dropping %s datapoints'
                                            % (skyline_app, chunks_dropped))
                                        # self.send_graphite_metric(
                                        #     'skyline.horizon.' + SERVER_METRIC_PATH + 'pickle_chunks_dropped',
                                        #     chunks_dropped)
                                        send_metric_name = '%s.pickle_chunks_dropped' % skyline_app_graphite_namespace
                                        send_graphite_metric(
                                            skyline_app, send_metric_name,
                                            chunks_dropped)
                                        chunk[:] = []

                    except Exception as e:
                        logger.info(e)
                        logger.info(
                            '%s :: incoming pickle connection dropped, attempting to reconnect'
                            % skyline_app)
                        break

            except Exception as e:
                logger.info('%s :: cannot connect to socket: %s' %
                            (skyline_app, str(e)))
                break
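listen_pickle depends on a read_all helper to consume exactly the 4-byte length header (Struct('!I'), an unsigned big-endian integer) and then the pickle body. A minimal sketch of such a helper, assuming a blocking socket; Skyline's own read_all may differ:

    def read_all(sock, n):
        """
        Read exactly n bytes from a blocking socket, looping because
        recv() may return fewer bytes than requested.
        """
        data = b''
        while len(data) < n:
            packet = sock.recv(n - len(data))
            if not packet:
                # The peer closed the connection mid-frame
                raise RuntimeError('connection closed before %s bytes were read' % str(n))
            data += packet
        return data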
Example #9
    def run(self):
        """
        Called when the process initializes.
        """

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error('worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    if settings.REDIS_PASSWORD:
                        self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    else:
                        self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp]
                metric_data = self.q.get(True, 1)

            except Empty:
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info('worker :: server has been issued a user signal to terminate - KeyboardInterrupt')
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data))
                    continue

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True
                    cache_key = 'flux.last.%s' % metric
                    last_metric_timestamp = None
                    try:
                        redis_last_metric_data = self.redis_conn.get(cache_key)
                        last_metric_data = literal_eval(redis_last_metric_data)
                        last_metric_timestamp = int(last_metric_data[0])
                    except:
                        last_metric_timestamp = False
                    if last_metric_timestamp:
                        if timestamp <= last_metric_timestamp:
                            valid_data = False
                    if valid_data:
                        submittedToGraphite = False
                        try:
                            graphyte.send(metric, value, timestamp)
                            submittedToGraphite = True
                            logger.info('worker :: sent %s, %s, %s to Graphite' % (metric, str(value), str(timestamp)))
                            metrics_sent_to_graphite += 1
                        except:
                            logger.error(traceback.format_exc())
                            logger.error('error :: worker :: failed to send metric data to Graphite for %s' % str(metric))
                            metric = None
                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            metric_data = [timestamp, value]
                            self.redis_conn.set(cache_key, str(metric_data))
                    else:
                        logger.info('worker :: discarded %s, %s, %s as a data point for %s has already been submitted to Graphite' % (
                            metric, str(value), str(timestamp), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    logger.info('worker :: sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp)))

            time_now = int(time())
            if (time_now - last_sent_to_graphite) >= 60:
                logger.info('worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: failed to send_graphite_metric %s with %s' % (
                        skyline_metric, str(metrics_sent_to_graphite)))
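Both run() variants send line-protocol data points with graphyte, which must be initialised once per process. A minimal standalone sketch; the host and metric name are illustrative, Skyline initialises graphyte from its own settings:

    import graphyte

    # Illustrative Carbon host and port, not Skyline settings
    graphyte.init('carbon.example.org', port=2003)
    # graphyte.send(metric, value, timestamp), as called by the worker
    graphyte.send('skyline.test.metric', 1.0, 1571400000)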
Example #10
    def run(self):
        """
        Called when the process initializes.
        """
        def pickle_data_to_graphite(data):
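            # The carbon pickle receiver expects a 4-byte big-endian
            # unsigned length header followed by a pickled list of
            # (metric, (timestamp, value)) tuples; protocol=2 keeps the
            # payload readable by Python 2 carbon daemons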

            message = None
            try:
                payload = pickle.dumps(data, protocol=2)
                header = struct.pack("!L", len(payload))
                message = header + payload
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to pickle to send to Graphite')
                return False
            if message:
                try:
                    sock = socket.socket()
                    sock.connect(
                        (CARBON_HOST, settings.FLUX_CARBON_PICKLE_PORT))
                    sock.sendall(message)
                    sock.close()
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send pickle data to Graphite'
                    )
                    return False
            else:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to pickle metric data into message'
                )
                return False
            return True

        def submit_pickle_data_to_graphite(pickle_data):

            # @modified 20201207 - Task #3864: flux - try except everything
            try:
                number_of_datapoints = len(pickle_data)
            except Exception as e:
                logger.error(
                    'error :: worker :: could not determine number_of_datapoints from len(pickle_data) - %s'
                    % str(e))
                return False

            data_points_sent = 0
            smallListOfMetricTuples = []
            tuples_added = 0

            for data in pickle_data:
                # @modified 20201207 - Task #3864: flux - try except everything
                try:
                    smallListOfMetricTuples.append(data)
                    tuples_added += 1
                    if tuples_added >= 480:
                        # @modified 20201207 - Task #3864: flux - try except everything
                        try:
                            pickle_data_sent = pickle_data_to_graphite(
                                smallListOfMetricTuples)
                        except Exception as e:
                            logger.error(
                                'error :: worker :: pickle_data_to_graphite error - %s'
                                % str(e))
                            pickle_data_sent = False

                        # Reduce the speed of submissions to Graphite
                        # if there are lots of data points
                        if number_of_datapoints > 4000:
                            sleep(0.3)
                        if pickle_data_sent:
                            data_points_sent += tuples_added
                            logger.info(
                                'worker :: sent %s/%s of %s data points to Graphite via pickle'
                                % (str(tuples_added), str(data_points_sent),
                                   str(number_of_datapoints)))
                            smallListOfMetricTuples = []
                            tuples_added = 0
                        else:
                            logger.error(
                                'error :: worker :: failed to send %s data points to Graphite via pickle'
                                % (str(tuples_added)))
                            return False
                except Exception as e:
                    logger.error(
                        'error :: worker :: error handling data in pickle_data - %s'
                        % str(e))
                    return False

            if smallListOfMetricTuples:
                # @modified 20201207 - Task #3864: flux - try except everything
                try:
                    tuples_to_send = len(smallListOfMetricTuples)
                    pickle_data_sent = pickle_data_to_graphite(
                        smallListOfMetricTuples)
                    if pickle_data_sent:
                        data_points_sent += tuples_to_send
                        logger.info(
                            'worker :: sent the last %s/%s of %s data points to Graphite via pickle'
                            % (str(tuples_to_send), str(data_points_sent),
                               str(number_of_datapoints)))
                    else:
                        logger.error(
                            'error :: failed to send the last %s data points to Graphite via pickle'
                            % (str(tuples_to_send)))
                        return False
                except Exception as e:
                    logger.error(
                        'error :: worker :: error in smallListOfMetricTuples pickle_data_to_graphite - %s'
                        % str(e))
                    return False

            return True
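
        # pickle_data accumulates (metric, (timestamp, value)) tuples below
        # and is flushed through submit_pickle_data_to_graphite when the
        # queue empties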

        logger.info('worker :: starting worker')

        last_sent_to_graphite = int(time())
        metrics_sent_to_graphite = 0

        # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
        last_zero_fill_to_graphite = 0
        metrics_sent = []

        remove_from_flux_queue_redis_set = []

        # @added 20201019 - Feature #3790: flux - pickle to Graphite
        pickle_data = []
        # send_to_reciever = 'line'
        send_to_reciever = 'pickle'

        # @modified 20201207 - Task #3864: flux - try except everything
        try:
            metric_data_queue_size = self.q.qsize()
        except Exception as e:
            logger.error(
                'error :: worker :: could not determine metric_data_queue_size - %s'
                % str(e))
            metric_data_queue_size = 0

        if metric_data_queue_size > 10:
            send_to_reciever = 'pickle'

        # @added 20201120 - Feature #3790: flux - pickle to Graphite
        # Debug Redis set
        metrics_data_sent = []

        # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
        # Even if flux.last Redis keys are disabled in flux they are used in
        # Vista
        vista_metrics = []
        if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
            try:
                vista_metrics = list(
                    self.redis_conn_decoded.sscan_iter('vista.metrics',
                                                       match='*'))
            except:
                vista_metrics = []

        # Populate API keys and tokens in memcache
        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:
            # Make sure Redis is up
            redis_up = False
            while not redis_up:
                try:
                    redis_up = self.redis_conn.ping()
                except:
                    logger.error(
                        'worker :: cannot connect to redis at socket path %s' %
                        (settings.REDIS_SOCKET_PATH))
                    sleep(2)
                    # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # Use get_redis_conn and get_redis_conn_decoded
                    # if settings.REDIS_PASSWORD:
                    #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # else:
                    #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        self.redis_conn = get_redis_conn(skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: could not get_redis_conn - %s'
                            % str(e))
                    try:
                        self.redis_conn_decoded = get_redis_conn_decoded(
                            skyline_app)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: could not get_redis_conn_decoded - %s'
                            % str(e))

            if LOCAL_DEBUG:
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: debug :: flux.httpMetricDataQueue queue size - %s'
                        % str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )

            metric_data = None
            try:
                # Get a metric from the queue with a 1 second timeout, each
                # metric item on the queue is a list e.g.
                # metric_data = [metricName, metricValue, metricTimestamp]
                metric_data = self.q.get(True, 1)

            except Empty:
                if pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: queue Empty failed to submit_pickle_data_to_graphite - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []
                logger.info('worker :: queue is empty and timed out')
                sleep(1)
                # @added 20201017 - Feature #3788: snab_flux_load_test
                # Send to Graphite even if worker gets no metrics
                if (int(time()) - last_sent_to_graphite) >= 60:
                    logger.info(
                        'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                        % str(metrics_sent_to_graphite))
                    skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                    try:
                        # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                        # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                        send_graphite_metric(skyline_app, skyline_metric,
                                             metrics_sent_to_graphite)
                        last_sent_to_graphite = int(time())
                        metrics_sent_to_graphite = 0
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to send_graphite_metric %s with %s'
                            % (skyline_metric, str(metrics_sent_to_graphite)))
                    metric_data_queue_size = 0
                    try:
                        metric_data_queue_size = self.q.qsize()
                        logger.info(
                            'worker :: flux.httpMetricDataQueue queue size - %s'
                            % str(metric_data_queue_size))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                        )
                    skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace
                    try:
                        send_graphite_metric(skyline_app, skyline_metric,
                                             metric_data_queue_size)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to send_graphite_metric %s with %s'
                            % (skyline_metric, str(metrics_sent_to_graphite)))
                    # @added 20201019 - Feature #3790: flux - pickle to Graphite
                    if metric_data_queue_size > 10:
                        send_to_reciever = 'pickle'
                    else:
                        send_to_reciever = 'line'
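                    # Note: pickle is currently forced regardless of the
                    # queue size check above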
                    send_to_reciever = 'pickle'

                    # @added 20201120 - Feature #3790: flux - pickle to Graphite
                    # Debug Redis set
                    metrics_data_sent_strs = []
                    for item in metrics_data_sent:
                        metrics_data_sent_strs.append(str(item))
                    if metrics_data_sent_strs:
                        try:
                            self.redis_conn.sadd('flux.metrics_data_sent',
                                                 *set(metrics_data_sent_strs))
                            logger.info(
                                'worker :: added %s items to the flux.metrics_data_sent Redis set'
                                % str(len(metrics_data_sent)))
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to add items to the flux.metrics_data_sent Redis set'
                            )
                        metrics_data_sent = []
                        try:
                            new_set = 'aet.flux.metrics_data_sent.%s' % str(
                                self.current_pid)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine current_pid for the aet.flux.metrics_data_sent Redis set name'
                            )
                            new_set = 'aet.flux.metrics_data_sent'
                        try:
                            self.redis_conn.rename('flux.metrics_data_sent',
                                                   new_set)
                            logger.info(
                                'worker :: renamed flux.metrics_data_sent Redis set to %s'
                                % new_set)
                        # @added 20201128 - Feature #3820: HORIZON_SHARDS
                        # With metrics that come in less frequently than
                        # every 60 seconds, it is possible that this key will
                        # not exist as flux has not been sent metric data,
                        # so this operation will error with no such key
                        except Exception as e:
                            traceback_str = traceback.format_exc()
                            if 'no such key' in str(e):
                                logger.warn(
                                    'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s'
                                    % (new_set, e))
                            else:
                                logger.error(traceback_str)
                                logger.error(
                                    'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set'
                                    % new_set)
                        try:
                            self.redis_conn.expire(new_set, 600)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to set 600 seconds TTL on %s Redis set'
                                % new_set)

                    # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
                    if FLUX_PERSIST_QUEUE:
                        redis_set_size = 0
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'worker :: flux.queue Redis set size of %s before removal of %s items'
                            % (str(redis_set_size),
                               str(len(remove_from_flux_queue_redis_set))))
                        if remove_from_flux_queue_redis_set:
                            try:
                                self.redis_conn.srem(
                                    'flux.queue',
                                    *set(remove_from_flux_queue_redis_set))
                                remove_from_flux_queue_redis_set = []
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to remove multiple items from flux.queue Redis set'
                                )
                            try:
                                redis_set_size = self.redis_conn.scard(
                                    'flux.queue')
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine size of flux.queue Redis set'
                                )
                            logger.info(
                                'worker :: flux.queue Redis set size of %s after the removal of items'
                                % (str(redis_set_size)))
                            remove_from_flux_queue_redis_set = []
                    # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                    # Even if flux.last Redis keys are disabled in flux they are used in
                    # Vista
                    vista_metrics = []
                    if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
                        try:
                            vista_metrics = list(
                                self.redis_conn_decoded.sscan_iter(
                                    'vista.metrics', match='*'))
                        except:
                            vista_metrics = []
            except NotImplementedError:
                pass
            except KeyboardInterrupt:
                logger.info(
                    'worker :: server has been issued a user signal to terminate - KeyboardInterrupt'
                )
            except SystemExit:
                logger.info('worker :: server was interrupted - SystemExit')
            except Exception as e:
                logger.error('error :: worker :: %s' % (str(e)))

            # @added 20200206 - Feature #3444: Allow flux to backfill
            # Added backfill
            backfill = False

            # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
            if metric_data and FLUX_PERSIST_QUEUE:
                try:
                    # Do not remove each individual metrics from the flux.queue
                    # Redis set, add to a list that is removed in one srem *set
                    # operation every 60 seconds.  This is a more performant
                    # method and requires a single blocking call for a batch of
                    # metrics, rather than a blocking call for every metric.
                    # self.redis_conn.srem('flux.queue', str(metric_data))
                    remove_from_flux_queue_redis_set.append(str(metric_data))
                except:
                    pass

            if metric_data:
                try:
                    metric = str(metric_data[0])
                    value = float(metric_data[1])
                    timestamp = int(metric_data[2])
                    # @added 20200206 - Feature #3444: Allow flux to backfill
                    # Added backfill
                    backfill = int(metric_data[3])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: debug :: queue item found - %s' %
                            str(metric_data))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s'
                        % str(metric_data))
                    continue

                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Only check the flux.last key if this is not backfill and
                # FLUX_CHECK_LAST_TIMESTAMP is enabled, or if the metric is a
                # VISTA_ENABLED metric
                cache_key = None
                # if FLUX_CHECK_LAST_TIMESTAMP:
                cache_key = 'flux.last.%s' % metric
                check_flux_last_key = False
                if not backfill and FLUX_CHECK_LAST_TIMESTAMP:
                    check_flux_last_key = True
                if VISTA_ENABLED:
                    if metric in vista_metrics:
                        check_flux_last_key = True

                if settings.FLUX_SEND_TO_CARBON:
                    # Best effort de-duplicate the data
                    valid_data = True

                    # @added 20200818 - Feature #3694: flux - POST multiple metrics
                    # Handle Redis and literal_eval separately
                    redis_last_metric_data = None

                    # @modified 20200206 - Feature #3444: Allow flux to backfill
                    # Only check flux.last key if this is not backfill
                    # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                    # Use the check_flux_last_key value determined above
                    # if not backfill:
                    if check_flux_last_key:
                        # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                        # Set cache_key outside the conditional block
                        # cache_key = 'flux.last.%s' % metric
                        last_metric_timestamp = None
                        try:
                            # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings
                            #                      Branch #3262: py3
                            # redis_last_metric_data = self.redis_conn.get(cache_key)
                            redis_last_metric_data = self.redis_conn_decoded.get(
                                cache_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                % str(cache_key))
                            redis_last_metric_data = None

                        # @modified 20200818 - Feature #3694: flux - POST multiple metrics
                        # Handle Redis and literal_eval separately, only
                        # literal_eval if Redis had data for the key
                        if redis_last_metric_data:
                            try:
                                last_metric_data = literal_eval(
                                    redis_last_metric_data)
                                last_metric_timestamp = int(
                                    last_metric_data[0])
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: last_metric_timestamp for %s from %s is %s'
                                        % (metric, str(cache_key),
                                           str(last_metric_timestamp)))
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to determine last_metric_timestamp from Redis key %s'
                                    % str(cache_key))
                                last_metric_timestamp = False

                        if last_metric_timestamp:
                            if timestamp <= last_metric_timestamp:
                                valid_data = False
                                if LOCAL_DEBUG:
                                    logger.info(
                                        'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s'
                                        % (str(timestamp),
                                           str(last_metric_timestamp), metric))

                    if valid_data:
                        submittedToGraphite = False
                        if send_to_reciever == 'line':
                            try:
                                graphyte.send(metric, value, timestamp)
                                submittedToGraphite = True
                                # modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: sent %s, %s, %s to Graphite - via graphyte'
                                        % (str(metric), str(value),
                                           str(timestamp)))
                                metrics_sent_to_graphite += 1
                                # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append(
                                    [metric, value, timestamp])
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: failed to send metric data to Graphite for %s'
                                    % str(metric))
                                metric = None
                        if send_to_reciever == 'pickle':
                            # @modified 20201212 - Task #3864: flux - try except everything
                            try:
                                tuple_data = (metric, (int(timestamp),
                                                       float(value)))
                                pickle_data.append(tuple_data)
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: sending %s, %s, %s to Graphite - via pickle'
                                        % (str(metric), str(value),
                                           str(timestamp)))
                                submittedToGraphite = True
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(metric)
                                # @added 20201120 - Feature #3790: flux - pickle to Graphite
                                # Debug Redis set
                                metrics_data_sent.append(
                                    [metric, value, timestamp])
                            except Exception as e:
                                logger.error(
                                    'error :: worker :: failed to append to pickle_data - %s'
                                    % str(e))

                        if submittedToGraphite:
                            # Update the metric Redis flux key
                            # @modified 20200206 - Feature #3444: Allow flux to backfill
                            # Only update the flux.last key if this is not backfill
                            # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                            # Use the check_flux_last_key value determined above
                            # if not backfill:
                            if check_flux_last_key:
                                metric_data = [timestamp, value]

                                # @modified 20201207 - Task #3864: flux - try except everything
                                try:
                                    self.redis_conn.set(
                                        cache_key, str(metric_data))
                                except Exception as e:
                                    logger.error(
                                        'error :: worker :: failed to set check_flux_last_key Redis key - %s'
                                        % str(e))

                            # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                            else:
                                # @added 20201120 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                                #                   Feature #3400: Identify air gaps in the metric data
                                # Only execute if IDENTIFY_AIRGAPS is enabled
                                if IDENTIFY_AIRGAPS:
                                    # @added 20200213 - Bug #3448: Repeated airgapped_metrics
                                    # Add a flux.filled key to Redis with a expiry
                                    # set to FULL_DURATION so that Analyzer knows to
                                    # sort and deduplicate the Redis time series
                                    # data as carbon-relay will send it to Horizon
                                    # and the datapoints will be out of order in the
                                    # Redis key
                                    try:
                                        flux_filled_key = 'flux.filled.%s' % str(
                                            metric)
                                        self.redis_conn.setex(
                                            flux_filled_key,
                                            settings.FULL_DURATION,
                                            int(time()))
                                        logger.info(
                                            'worker :: set Redis key %s' %
                                            (str(flux_filled_key)))
                                    except Exception as e:
                                        logger.error(
                                            'error :: failed to set Redis flux.filled key: %s'
                                            % e)
                    else:
                        # @modified 20201016 - Feature #3788: snab_flux_load_test
                        if FLUX_VERBOSE_LOGGING:
                            logger.info(
                                'worker :: discarded %s, %s, %s - a data point for %s has already been submitted to Graphite'
                                % (str(metric), str(value), str(timestamp),
                                   str(metric)))
                else:
                    logger.info(
                        'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s'
                        % (str(settings.FLUX_SEND_TO_CARBON), str(metric),
                           str(value), str(timestamp)))

                if settings.FLUX_SEND_TO_STATSD:
                    statsd_conn.incr(metric, value, timestamp)
                    # @modified 20201016 - Feature #3788: snab_flux_load_test
                    if FLUX_VERBOSE_LOGGING:
                        logger.info('worker :: sent %s, %s, %s to statsd' %
                                    (metric, str(value), str(timestamp)))
                    # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
                    metrics_sent.append(metric)

                submit_pickle_data = False
                if pickle_data:
                    number_of_datapoints = len(pickle_data)
                    if number_of_datapoints >= 1000:
                        submit_pickle_data = True
                    else:
                        try:
                            metric_data_queue_size = self.q.qsize()
                        except:
                            metric_data_queue_size = 0
                        if metric_data_queue_size == 0:
                            submit_pickle_data = True
                if submit_pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: submit_pickle_data_to_graphite failed - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []
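                # Note: submit_pickle_data_to_graphite() is defined elsewhere
                # in this module.  A minimal sketch of the carbon pickle
                # protocol it is assumed to implement - a 4 byte big-endian
                # length header followed by the pickled list of
                # (metric, (timestamp, value)) tuples; the host and port here
                # are illustrative only:
                #
                #     import pickle
                #     import socket
                #     import struct
                #
                #     def submit_pickle_data_to_graphite(pickle_data):
                #         payload = pickle.dumps(pickle_data, protocol=2)
                #         header = struct.pack('!L', len(payload))
                #         sock = socket.create_connection(('127.0.0.1', 2004))
                #         try:
                #             sock.sendall(header + payload)
                #         finally:
                #             sock.close()
                #         return True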

            time_now = int(time())

            # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES
            # Send 0 for any metric in the flux.zero_fill_metrics Redis set that
            # has not submitted data in the last 60 seconds.  The flux.last
            # Redis key is not updated for these sent 0 values so if the source
            # sends data for a timestamp in the period later (due to a lag, etc),
            # it will be valid and sent to Graphite.
            if FLUX_ZERO_FILL_NAMESPACES:
                if not last_zero_fill_to_graphite:
                    last_zero_fill_to_graphite = time_now - 60
                if (time_now - last_sent_to_graphite) >= 60:
                    try:
                        flux_zero_fill_metrics = list(
                            self.redis_conn_decoded.smembers(
                                'flux.zero_fill_metrics'))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to generate a list from flux.zero_fill_metrics Redis set'
                        )
                        flux_zero_fill_metrics = []
                    for flux_zero_fill_metric in flux_zero_fill_metrics:
                        if flux_zero_fill_metric not in metrics_sent:
                            try:
                                graphyte.send(flux_zero_fill_metric, 0.0,
                                              time_now)
                                # @modified 20201016 - Feature #3788: snab_flux_load_test
                                if FLUX_VERBOSE_LOGGING:
                                    logger.info(
                                        'worker :: zero fill - sent %s, %s, %s to Graphite'
                                        % (str(flux_zero_fill_metric),
                                           str(0.0), str(time_now)))
                                metrics_sent_to_graphite += 1
                                metrics_sent.append(flux_zero_fill_metric)
                            except:
                                logger.error(traceback.format_exc())
                                logger.error(
                                    'error :: worker :: zero fill - failed to send metric data to Graphite for %s'
                                    % str(flux_zero_fill_metric))
                    last_zero_fill_to_graphite = time_now
                    metrics_sent = []

            if (time_now - last_sent_to_graphite) >= 60:
                if pickle_data:
                    # @modified 20201207 - Task #3864: flux - try except everything
                    try:
                        pickle_data_submitted = submit_pickle_data_to_graphite(
                            pickle_data)
                    except Exception as e:
                        logger.error(
                            'error :: worker :: submit_pickle_data_to_graphite failed last_sent_to_graphite >= 60 - %s'
                            % str(e))
                        pickle_data_submitted = False

                    if pickle_data_submitted:
                        pickle_data = []
                logger.info(
                    'worker :: metrics_sent_to_graphite in last 60 seconds - %s'
                    % str(metrics_sent_to_graphite))
                skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace
                try:
                    # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host
                    # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now)
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metrics_sent_to_graphite)
                    last_sent_to_graphite = int(time())
                    metrics_sent_to_graphite = 0
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metrics_sent_to_graphite)))
                metric_data_queue_size = 0
                try:
                    metric_data_queue_size = self.q.qsize()
                    logger.info(
                        'worker :: flux.httpMetricDataQueue queue size - %s' %
                        str(metric_data_queue_size))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue'
                    )
                skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace
                try:
                    send_graphite_metric(skyline_app, skyline_metric,
                                         metric_data_queue_size)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to send_graphite_metric %s with %s'
                        % (skyline_metric, str(metric_data_queue_size)))
                # @added 20201019 - Feature #3790: flux - pickle to Graphite
                if metric_data_queue_size > 10:
                    send_to_reciever = 'pickle'
                else:
                    send_to_reciever = 'line'
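                # The 'line' path above sends datapoints one at a time via the
                # graphyte library, which is assumed to have been initialised
                # earlier in the module along these lines (host and port are
                # illustrative only):
                #
                #     import graphyte
                #     graphyte.init('127.0.0.1', port=2003)
                #     # graphyte.send() emits one plaintext protocol line per
                #     # datapoint, e.g. 'my.metric 1.5 1602864000\n'
                #     graphyte.send('my.metric', 1.5, timestamp=1602864000)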

                # @added 202011120 - Feature #3790: flux - pickle to Graphite
                # Debug Redis set
                metrics_data_sent_strs = []
                for item in metrics_data_sent:
                    metrics_data_sent_strs.append(str(item))
                if metrics_data_sent_strs:
                    try:
                        self.redis_conn.sadd('flux.metrics_data_sent',
                                             *set(metrics_data_sent_strs))
                        logger.info(
                            'worker :: added %s items to the flux.metrics_data_sent Redis set'
                            % str(len(metrics_data_sent)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to add items to the flux.metrics_data_sent Redis set'
                        )
                    metrics_data_sent = []
                    try:
                        new_set = 'aet.flux.metrics_data_sent.%s' % str(
                            self.current_pid)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine current_pid for aet.flux.metrics_data_sent Redis set name'
                        )
                        new_set = 'aet.flux.metrics_data_sent'
                    try:
                        self.redis_conn.rename('flux.metrics_data_sent',
                                               new_set)
                        logger.info(
                            'worker :: renamed flux.metrics_data_sent Redis set to %s'
                            % new_set)
                    # @modified 20201128 - Feature #3820: HORIZON_SHARDS
                    # With metrics that come in at a frequency of less
                    # than 60 seconds, it is possible that this key will
                    # not exist as flux has not been sent metric data
                    # so this operation will error with no such key
                    except Exception as e:
                        traceback_str = traceback.format_exc()
                        if 'no such key' in str(e):
                            logger.warn(
                                'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s'
                                % (new_set, e))
                        else:
                            logger.error(traceback_str)
                            logger.error(
                                'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set'
                                % new_set)

                    try:
                        self.redis_conn.expire(new_set, 600)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to set 600 seconds TTL on %s Redis set'
                            % new_set)

                # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE
                if FLUX_PERSIST_QUEUE:
                    redis_set_size = 0
                    try:
                        redis_set_size = self.redis_conn.scard('flux.queue')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to determine size of flux.queue Redis set'
                        )
                    logger.info(
                        'worker - flux.queue Redis set size %s before removal of %s items'
                        % (str(redis_set_size),
                           str(len(remove_from_flux_queue_redis_set))))
                    if remove_from_flux_queue_redis_set:
                        try:
                            self.redis_conn.srem(
                                'flux.queue',
                                *set(remove_from_flux_queue_redis_set))
                            remove_from_flux_queue_redis_set = []
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to remove multiple items from flux.queue Redis set'
                            )
                        try:
                            redis_set_size = self.redis_conn.scard(
                                'flux.queue')
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine size of flux.queue Redis set'
                            )
                        logger.info(
                            'worker - flux.queue Redis set size of %s after the removal of items'
                            % (str(redis_set_size)))
                        remove_from_flux_queue_redis_set = []
                # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP
                # Even if flux.last Redis keys are disabled in flux, they are
                # used in Vista
                vista_metrics = []
                if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED:
                    try:
                        vista_metrics = list(
                            self.redis_conn_decoded.sscan_iter('vista.metrics',
                                                               match='*'))
                    except:
                        vista_metrics = []
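The duplicate-datapoint guard at the top of this example relies on a per-metric flux.last.<metric> Redis key that stores the last submitted [timestamp, value]. A minimal sketch of that pattern, assuming a redis-py connection; the key layout mirrors the worker's cache_key usage, but the names and socket path are illustrative only:

    from ast import literal_eval
    from redis import StrictRedis

    redis_conn = StrictRedis(unix_socket_path='/tmp/redis.sock')

    def is_new_datapoint(metric, timestamp, value):
        cache_key = 'flux.last.%s' % metric
        last = redis_conn.get(cache_key)
        if last:
            # the key stores str([timestamp, value])
            last_timestamp = int(literal_eval(last.decode('utf-8'))[0])
            if int(timestamp) <= last_timestamp:
                # a datapoint for this or a later timestamp was already sent
                return False
        redis_conn.set(cache_key, str([int(timestamp), float(value)]))
        return True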
Exemple #11
0
    def run(self):
        """
        Called when the process initializes.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
        # If Luminosity is not enabled, do nothing
        luminosity_enabled = True
        try:
            luminosity_enabled = settings.LUMINOSITY_ENABLED
            logger.info('LUMINOSITY_ENABLED is set to %s' %
                        str(luminosity_enabled))
        except:
            logger.info(
                'warning :: LUMINOSITY_ENABLED is not declared in settings.py, defaults to True'
            )

        # @added 20190417 - Feature #2950: Report defaulted settings to log
        # Added all the globally declared settings to enable reporting the
        # state of each setting in the log.
        try:
            ENABLE_LUMINOSITY_DEBUG = settings.ENABLE_LUMINOSITY_DEBUG
            logger.info(
                'ENABLE_LUMINOSITY_DEBUG is set from settings.py to %s' %
                str(ENABLE_LUMINOSITY_DEBUG))
        except:
            logger.info(
                'warning :: ENABLE_LUMINOSITY_DEBUG is not declared in settings.py, defaults to False'
            )
            ENABLE_LUMINOSITY_DEBUG = False
        try:
            SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME
            if SERVER_METRIC_PATH == '.':
                SERVER_METRIC_PATH = ''
            logger.info('SERVER_METRIC_PATH is set from settings.py to %s' %
                        str(SERVER_METRIC_PATH))
        except:
            SERVER_METRIC_PATH = ''
            logger.info(
                'warning :: SERVER_METRIC_PATH is not declared in settings.py, defaults to \'\''
            )
        try:
            LUMINOSITY_PROCESSES = settings.LUMINOSITY_PROCESSES
            logger.info('LUMINOSITY_PROCESSES is set from settings.py to %s' %
                        str(LUMINOSITY_PROCESSES))
        except:
            # @modified 20180110 - Task #2266: Evaluate luminol for the luminosity branch
            # It is fast and lightweight
            # luminosity_processes = 2
            LUMINOSITY_PROCESSES = 1
            logger.info(
                'warning :: cannot determine LUMINOSITY_PROCESSES from settings.py, defaults to %s'
                % str(LUMINOSITY_PROCESSES))
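        # Each of the settings reads above follows the same
        # read-with-default pattern; a hypothetical helper (not part of this
        # module) that captures it might look like:
        #
        #     def setting_or_default(name, default):
        #         try:
        #             value = getattr(settings, name)
        #         except AttributeError:
        #             value = default
        #             logger.info(
        #                 'warning :: %s is not declared in settings.py, '
        #                 'defaults to %s' % (name, str(default)))
        #         return value
        #
        #     LUMINOSITY_PROCESSES = setting_or_default('LUMINOSITY_PROCESSES', 1)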

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_LUMINOSITY_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error(
                    'error :: cannot connect to redis at socket path %s' %
                    (settings.REDIS_SOCKET_PATH))
                sleep(30)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes
                # types need to be decoded as utf-8 to str
                # if settings.REDIS_PASSWORD:
                #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                # else:
                #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                # @added 20191115 - Bug #3266: py3 Redis binary objects not strings
                #                   Branch #3262: py3
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' %
                             skyline_app)

            # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting
            # If Luminosity is not enabled, do nothing
            if not luminosity_enabled:
                logger.info(
                    'luminosity is not enabled, LUMINOSITY_ENABLED is set to %s, sleeping for 20 seconds'
                    % str(settings.LUMINOSITY_ENABLED))
                sleep(20)
                continue
            """
            Determine if any new anomalies have been added
            """
            while True:
                process_anomaly_id = None
                last_processed_anomaly_id = None
                memcache_last_processed_anomaly_id_data = False
                # Check memcached before MySQL
                memcache_key = '%s.last.processed.anomaly.id' % skyline_app
                if settings.MEMCACHE_ENABLED:
                    try:
                        # @modified 20191029 - Task #3304: py3 - handle pymemcache bytes not str
                        # last_processed_anomaly_id = self.memcache_client.get(memcache_key)
                        if python_version == 2:
                            last_processed_anomaly_id = self.memcache_client.get(
                                memcache_key)
                        else:
                            last_processed_anomaly_id = self.memcache_client.get(
                                memcache_key).decode('utf-8')
                        # if memcache does not have the key the response to the
                        # client is None, it does not raise an exception
                    except:
                        # @modified 20200507 - stop reporting this as an error
                        # it can be expected to happen from time to time
                        # logger.error('error :: failed to get %s from memcache' % memcache_key)
                        logger.info(
                            'failed to get %s from memcache, will query DB' %
                            memcache_key)
                    try:
                        self.memcache_client.close()
                    except:
                        logger.error(
                            'error :: failed to close memcache_client')

                if last_processed_anomaly_id:
                    logger.info(
                        'last_processed_anomaly_id found in memcache - %s' %
                        str(last_processed_anomaly_id))
                    memcache_last_processed_anomaly_id_data = True
                else:
                    # @modified 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                    #                      Branch #3002: docker
                    # Log appropriate to whether memcache is enabled or not
                    if settings.MEMCACHE_ENABLED:
                        logger.info(
                            'last_processed_anomaly_id key was NOT found in memcache - %s'
                            % str(last_processed_anomaly_id))
                    else:
                        logger.info(
                            'memcache not enabled, not checking for last_processed_anomaly_id key'
                        )

                if not last_processed_anomaly_id:
                    query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1'
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        try:
                            last_processed_anomaly_id = int(results[0][0])
                            logger.info(
                                'last_processed_anomaly_id found from DB - %s'
                                % str(last_processed_anomaly_id))
                        except:
                            logger.error(traceback.format_exc())

                        if last_processed_anomaly_id and settings.MEMCACHE_ENABLED:
                            if not memcache_last_processed_anomaly_id_data:
                                logger.info(
                                    'Populating memcache with DB result - %s' %
                                    str(last_processed_anomaly_id))
                                try:
                                    self.memcache_client.set(
                                        memcache_key,
                                        int(last_processed_anomaly_id))
                                    logger.info(
                                        'populated memcache key %s with %s' %
                                        (memcache_key,
                                         str(last_processed_anomaly_id)))
                                except:
                                    logger.error(
                                        'error :: failed to set the memcache key - %s - %s'
                                        % (memcache_key,
                                           str(last_processed_anomaly_id)))
                                try:
                                    self.memcache_client.close()
                                except:
                                    logger.error(
                                        'error :: failed to close memcache_client'
                                    )
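                # The memcache-then-MySQL flow above is a cache-aside read; a
                # condensed sketch of the same pattern, with illustrative
                # helper names that are not part of this module:
                #
                #     def get_last_processed_anomaly_id():
                #         cached = memcache_get(memcache_key)  # None on a miss
                #         if cached is not None:
                #             return int(cached)
                #         results = mysql_select(
                #             skyline_app,
                #             'SELECT MAX(id) FROM luminosity')
                #         last_id = int(results[0][0]) if results else 0
                #         memcache_set(memcache_key, last_id)  # repopulate
                #         return last_id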

                if not last_processed_anomaly_id:
                    # Check MySQL
                    now = int(time())
                    after = now - 600
                    query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str(
                        after)  # nosec
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                        # Handle the first one
                        last_processed_anomaly_id = process_anomaly_id - 1
                    else:
                        logger.info('no new anomalies in the anomalies table')

                # @added 20190517 - Bug #3016: Handle no anomaly ids in luminosity
                #                   Branch #3002: docker
                # When Skyline is first installed, if luminosity is enabled it
                # reports errors as there are no anomaly ids
                if str(last_processed_anomaly_id) == 'None':
                    last_processed_anomaly_id = 0

                query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str(
                    last_processed_anomaly_id)  # nosec
                results = None
                try:
                    results = mysql_select(skyline_app, query)
                except:
                    logger.error('error :: MySQL query failed - %s' % query)
                if results:
                    try:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found the next new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: from query - %s' % query)
                else:
                    logger.info('no new anomalies in the anomalies table')

                if process_anomaly_id and last_processed_anomaly_id:
                    if isinstance(last_processed_anomaly_id, int):
                        if isinstance(process_anomaly_id, int):
                            if last_processed_anomaly_id == process_anomaly_id:
                                logger.info(
                                    'anomaly id already processed - %s' %
                                    str(process_anomaly_id))
                                process_anomaly_id = None

                if not process_anomaly_id:
                    logger.info(
                        'sleeping 20 seconds, no anomalies to correlate - last processed anomaly id - %s'
                        % str(last_processed_anomaly_id))
                    sleep(20)
                    up_now = time()
                    # Report app up
                    try:
                        self.redis_conn.setex(skyline_app, 120, up_now)
                        logger.info('updated Redis key for %s up' %
                                    skyline_app)
                    except:
                        logger.error(
                            'error :: failed to update Redis key for %s up' %
                            skyline_app)

                cache_key = '%s.sent_graphite_metrics' % skyline_app
                redis_sent_graphite_metrics = False
                try:
                    redis_sent_graphite_metrics = self.redis_conn.get(
                        cache_key)
                except Exception as e:
                    logger.error(
                        'error :: could not query Redis for key %s: %s' %
                        (cache_key, e))

                # Flush metrics to Graphite
                if not redis_sent_graphite_metrics:
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # correlations = str(len(self.correlations))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # correlations = str(len(list(self.redis_conn.smembers('luminosity.correlations'))))
                        correlations = str(
                            len(
                                list(
                                    self.redis_conn_decoded.smembers(
                                        'luminosity.correlations'))))
                    except:
                        correlations = '0'
                    logger.info('correlations       :: %s' % correlations)
                    send_metric_name = '%s.correlations' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         correlations)

                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    try:
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # runtimes = list(self.redis_conn.smembers('luminosity.runtimes'))
                        runtimes = list(
                            self.redis_conn_decoded.smembers(
                                'luminosity.runtimes'))
                    except:
                        runtimes = []

                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    #                   Branch #2270: luminosity
                    # runtime metric to monitor the time it takes to process
                    # correlations
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # if len(self.runtimes) > 1:
                        #     avg_runtime = sum(self.runtimes) / len(self.runtimes)
                        # else:
                        #     avg_runtime = sum(self.runtimes)
                        # the decoded Redis set members are strings so cast
                        # to float before summing
                        if len(runtimes) > 1:
                            avg_runtime = sum([float(r) for r in runtimes]) / len(runtimes)
                        else:
                            avg_runtime = sum([float(r) for r in runtimes])
                    except:
                        avg_runtime = '0'
                    logger.info('avg_runtime       :: %s' % str(avg_runtime))
                    send_metric_name = '%s.avg_runtime' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         str(avg_runtime))
                    try:
                        # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                        # metrics_checked_for_correlation = str(sum(self.metrics_checked_for_correlation))
                        # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings
                        #                      Branch #3262: py3
                        # metrics_checked_for_correlation = str(len(list(self.redis_conn.smembers('luminosity.metrics_checked_for_correlation'))))
                        metrics_checked_for_correlation = str(
                            len(
                                list(
                                    self.redis_conn_decoded.smembers(
                                        'luminosity.metrics_checked_for_correlation'
                                    ))))
                    except:
                        metrics_checked_for_correlation = '0'
                    logger.info('metrics_checked_for_correlation   :: %s' %
                                metrics_checked_for_correlation)
                    send_metric_name = '%s.metrics_checked_for_correlation' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         metrics_checked_for_correlation)
                    sent_graphite_metrics_now = int(time())
                    try:
                        self.redis_conn.setex(cache_key, 59,
                                              sent_graphite_metrics_now)
                        logger.info('updated Redis key - %s' % cache_key)
                    except:
                        logger.error(
                            'error :: failed to update Redis key - %s' %
                            cache_key)

                    # Reset lists
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.correlations[:] = []
                    # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                    # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # self.runtimes[:] = []
                    # self.metrics_checked_for_correlation[:] = []

                    # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage
                    # Use Redis sets instead of Manager().list()
                    delete_redis_sets = [
                        'luminosity.correlations', 'luminosity.runtimes',
                        'luminosity.metrics_checked_for_correlation'
                    ]
                    for i_redis_set in delete_redis_sets:
                        redis_set_to_delete = i_redis_set
                        try:
                            self.redis_conn.delete(redis_set_to_delete)
                            logger.info('deleted Redis set - %s' %
                                        redis_set_to_delete)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to delete Redis set - %s' %
                                redis_set_to_delete)

                # @added 20180720 - Task #2462: Implement useful metrics for Luminosity
                #                   Feature #2464: luminosity_remote_data
                # Added the ability to add a Redis key to override the memcached
                # key luminosity.last.processed.anomaly.id so that it does not
                # have to be changed via telnet to memcache.
                if not process_anomaly_id or not redis_sent_graphite_metrics:
                    cache_key = '%s.last.processed.anomaly.id' % skyline_app
                    redis_last_processed_anomaly_id_redis_key = False
                    try:
                        redis_last_processed_anomaly_id_redis_key = self.redis_conn.get(
                            cache_key)
                    except Exception as e:
                        logger.error(
                            'error :: could not query Redis for key %s: %s' %
                            (cache_key, e))
                    if redis_last_processed_anomaly_id_redis_key:
                        logger.info(
                            'found Redis %s key to override the memcache key, setting process_anomaly_id to %s'
                            % (cache_key,
                               str(redis_last_processed_anomaly_id_redis_key)))
                        try:
                            process_anomaly_id = int(
                                redis_last_processed_anomaly_id_redis_key)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: failed to set process_anomaly_id from Redis override key value'
                            )
                        # And remove the Redis override key as it is only meant
                        # to override once to allow for a replay for debug
                        # purposes only.
                        try:
                            self.redis_conn.setex(
                                cache_key, 1,
                                int(redis_last_processed_anomaly_id_redis_key))
                            logger.info('updated Redis key - %s' % cache_key)
                        except:
                            logger.error(
                                'error :: failed to update Redis key - %s with a 1 second expiry to delete it'
                                % cache_key)

                if process_anomaly_id:
                    break

            # Spawn process
            logger.info('spawning processes to correlate anomaly id %s' %
                        str(process_anomaly_id))
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
            for i in range(1, LUMINOSITY_PROCESSES + 1):
                try:
                    p = Process(target=self.spin_process,
                                args=(i, process_anomaly_id))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' %
                                (str(pid_count), str(LUMINOSITY_PROCESSES)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to start spin_process')
                    continue

            # Self monitor processes and terminate if any spin_process has run
            # for too long
            p_starts = time()
            while time() - p_starts <= 60:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s spin_process completed in %.2f seconds' %
                                (str(LUMINOSITY_PROCESSES), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('timed out, killing all spin_process processes')
                for p in pids:
                    try:
                        p.terminate()
                        # p.join()
                        logger.info('killed spin_process process')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: failed to kill spin_process process')

            for p in pids:
                if p.is_alive():
                    logger.info('stopping spin_process - %s' %
                                (str(p.is_alive())))
                    p.join()
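            # The monitor loop above uses Python's while/else - the else
            # branch only runs when the 60 second window expires without a
            # break.  The same pattern in isolation:
            #
            #     deadline = time() + 60
            #     while time() < deadline:
            #         if not any(p.is_alive() for p in pids):
            #             break  # all workers finished in time
            #         sleep(.1)
            #     else:
            #         for p in pids:
            #             p.terminate()  # timed out - kill stragglers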

            process_runtime = time() - now
            if process_runtime < 10:
                sleep_for = (10 - process_runtime)
                logger.info(
                    'sleeping for %.2f seconds due to low run time...' %
                    sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except:
                    logger.error('error :: failed to del sleep_for')
            try:
                del process_runtime
            except:
                logger.error('error :: failed to del process_runtime')
Exemple #12
0
    def run(self):
        """
        Called when the process initializes.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('error :: skyline cannot connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            self.redis_conn.setex(skyline_app, 120, now)

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Reset boundary_metrics
            boundary_metrics = []

            # Build boundary metrics
            for metric_name in unique_metrics:
                for metric in BOUNDARY_METRICS:
                    CHECK_MATCH_PATTERN = metric[0]
                    check_match_pattern = re.compile(CHECK_MATCH_PATTERN)
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    pattern_match = check_match_pattern.match(base_name)
                    if pattern_match:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: boundary metric - pattern MATCHED - ' + metric[0] + " | " + base_name)
                        boundary_metrics.append([metric_name, metric[1]])

            if ENABLE_BOUNDARY_DEBUG:
                logger.info('debug :: boundary metrics - ' + str(boundary_metrics))

            if len(boundary_metrics) == 0:
                logger.info('no Boundary metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.BOUNDARY_PROCESSES + 1):
                if i > len(boundary_metrics):
                    logger.info('WARNING: Skyline Boundary is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, boundary_metrics))
                pids.append(p)
                p.start()

            # Send wait signal to zombie processes
            for p in pids:
                p.join()

            # Grab data from the queue and populate dictionaries
            exceptions = dict()
            anomaly_breakdown = dict()
            while 1:
                try:
                    key, value = self.anomaly_breakdown_q.get_nowait()
                    if key not in anomaly_breakdown.keys():
                        anomaly_breakdown[key] = value
                    else:
                        anomaly_breakdown[key] += value
                except Empty:
                    break

            while 1:
                try:
                    key, value = self.exceptions_q.get_nowait()
                    if key not in exceptions.keys():
                        exceptions[key] = value
                    else:
                        exceptions[key] += value
                except Empty:
                    break

            # Send alerts
            if settings.BOUNDARY_ENABLE_ALERTS:
                for anomalous_metric in self.anomalous_metrics:
                    datapoint = str(anomalous_metric[0])
                    metric_name = anomalous_metric[1]
                    base_name = metric_name.replace(FULL_NAMESPACE, '', 1)
                    expiration_time = str(anomalous_metric[2])
                    metric_trigger = str(anomalous_metric[5])
                    alert_threshold = int(anomalous_metric[6])
                    metric_alerters = anomalous_metric[7]
                    algorithm = anomalous_metric[8]
                    if ENABLE_BOUNDARY_DEBUG:
                        logger.info("debug :: anomalous_metric - " + str(anomalous_metric))

                    # Determine how many times the anomaly has been seen if the
                    # ALERT_THRESHOLD is set to > 1 and create a cache key in
                    # Redis to keep count so that alert_threshold can be honored
                    if alert_threshold == 0:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug :: alert_threshold - " + str(alert_threshold))

                    if alert_threshold == 1:
                        times_seen = 1
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info("debug :: alert_threshold - " + str(alert_threshold))

                    if alert_threshold > 1:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: alert_threshold - ' + str(alert_threshold))
                        anomaly_cache_key_count_set = False
                        anomaly_cache_key_expiration_time = (int(alert_threshold) + 1) * 60
                        anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name)
                        try:
                            anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key)
                            if not anomaly_cache_key_count:
                                try:
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('debug :: redis no anomaly_cache_key - ' + str(anomaly_cache_key))
                                    times_seen = 1
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info('debug :: redis setex anomaly_cache_key - ' + str(anomaly_cache_key))
                                    self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                    logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                                except Exception as e:
                                    logger.error('error :: redis setex failed :: %s' % str(anomaly_cache_key))
                                    logger.error('error :: could not set key: %s' % e)
                            else:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: redis anomaly_cache_key retrieved OK - ' + str(anomaly_cache_key))
                                anomaly_cache_key_count_set = True
                        except:
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info('debug :: redis failed - anomaly_cache_key retrieval failed - ' + str(anomaly_cache_key))
                            anomaly_cache_key_count_set = False

                        if anomaly_cache_key_count_set:
                            unpacker = Unpacker(use_list=False)
                            unpacker.feed(anomaly_cache_key_count)
                            raw_times_seen = list(unpacker)
                            times_seen = int(raw_times_seen[0]) + 1
                            try:
                                self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen)))
                                logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
                            except:
                                times_seen = 1
                                logger.error('error :: set anomaly seen key failed :: %s seen %s' % (anomaly_cache_key, str(times_seen)))
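                    # The times_seen counter above round-trips msgpack blobs
                    # through GET/SETEX.  An alternative (not what this module
                    # does) would be Redis' atomic INCR with a TTL refresh:
                    #
                    #     times_seen = self.redis_conn.incr(anomaly_cache_key)
                    #     self.redis_conn.expire(
                    #         anomaly_cache_key,
                    #         anomaly_cache_key_expiration_time)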

                    # Alert the alerters if times_seen > alert_threshold
                    if times_seen >= alert_threshold:
                        if ENABLE_BOUNDARY_DEBUG:
                            logger.info('debug :: times_seen %s is greater than or equal to alert_threshold %s' % (str(times_seen), str(alert_threshold)))
                        for alerter in metric_alerters.split("|"):
                            # Determine alerter limits
                            send_alert = False
                            alerts_sent = 0
                            if ENABLE_BOUNDARY_DEBUG:
                                logger.info('debug :: checking alerter - %s' % alerter)
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: determining alerter_expiration_time for settings')
                                alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter]
                                alerter_expiration_time = int(alerter_expiration_time_setting)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info('debug :: determined alerter_expiration_time from settings - %s' % str(alerter_expiration_time))
                            except:
                                # Set an arbitrary expiry time if not set
                                alerter_expiration_time = 160
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: could not determine alerter_expiration_time from settings")
                            try:
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: determining alerter_limit from settings")
                                alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter]
                                alerter_limit = int(alerter_limit_setting)
                                alerter_limit_set = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: determined alerter_limit from settings - %s" % str(alerter_limit))
                            except:
                                alerter_limit_set = False
                                send_alert = True
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: could not determine alerter_limit from settings")

                            # If the alerter_limit is set determine how many
                            # alerts the alerter has sent
                            if alerter_limit_set:
                                alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                try:
                                    alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key)
                                    if not alerter_sent_count_key_data:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key))
                                        alerts_sent = 0
                                        send_alert = True
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: alerts_sent set to %s" % str(alerts_sent))
                                            logger.info("debug :: send_alert set to %s" % str(sent_alert))
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: redis alerter key retrieved, unpacking" + str(alerter_sent_count_key))
                                        unpacker = Unpacker(use_list=False)
                                        unpacker.feed(alerter_sent_count_key_data)
                                        raw_alerts_sent = list(unpacker)
                                        alerts_sent = int(raw_alerts_sent[0])
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: alerter %s alerts sent %s " % (str(alerter), str(alerts_sent)))
                                except:
                                    logger.info("No key set - %s" % alerter_sent_count_key)
                                    alerts_sent = 0
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug :: alerts_sent set to %s" % str(alerts_sent))
                                        logger.info("debug :: send_alert set to %s" % str(send_alert))

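                                # Only allow the alert through if the alerter
                                # has not yet hit its limit within the current
                                # expiry period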
                                if alerts_sent < alerter_limit:
                                    send_alert = True
                                    if ENABLE_BOUNDARY_DEBUG:
                                        logger.info("debug :: alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit)))
                                        logger.info("debug :: send_alert set to %s" % str(send_alert))

                            # Send alert
                            alerter_alert_sent = False
                            if send_alert:
                                cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm)
                                if ENABLE_BOUNDARY_DEBUG:
                                    logger.info("debug :: checking cache_key - %s" % cache_key)
                                try:
                                    last_alert = self.redis_conn.get(cache_key)
                                    if not last_alert:
                                        try:
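                                            # Create the last_alert key with a
                                            # TTL of anomalous_metric[2] seconds,
                                            # while it exists only syslog alerts
                                            # are sent for this alerter, metric
                                            # and algorithm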
                                            self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0])))
                                            if ENABLE_BOUNDARY_DEBUG:
                                                logger.info('debug :: key setex OK - %s' % (cache_key))
                                            trigger_alert(alerter, datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                            logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                            alerter_alert_sent = True
                                        except Exception as e:
                                            logger.error('error :: alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm))
                                            logger.error('error :: could not send alert: %s' % str(e))
                                            trigger_alert('syslog', datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    else:
                                        if ENABLE_BOUNDARY_DEBUG:
                                            logger.info("debug :: cache_key exists not alerting via %s for %s is less than alerter_limit %s" % (alerter, cache_key))
                                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                                except:
                                    trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                    logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))
                            else:
                                trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                                logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

                            # Update the alerts sent for the alerter cache key,
                            # to allow for alert limiting
                            if alerter_alert_sent and alerter_limit_set:
                                try:
                                    alerter_sent_count_key = 'alerts_sent.%s' % (alerter)
                                    new_alerts_sent = int(alerts_sent) + 1
                                    self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent)))
                                    logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))
                                except:
                                    logger.error('error :: failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent)))
                    else:
                        # Always alert to syslog, even if alert_threshold is not
                        # breached or if send_alert is not True
                        trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm)
                        logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm))

            # Write anomalous_metrics to static webapp directory
            if len(self.anomalous_metrics) > 0:
                filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
                with open(filename, 'w') as fh:
                    # Make it JSONP with a handle_data() function
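                    # e.g. (illustrative) the file content will look like:
                    # handle_data([[<metric data>], [<metric data>], ...])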
                    anomalous_metrics = list(self.anomalous_metrics)
                    anomalous_metrics.sort(key=operator.itemgetter(1))
                    fh.write('handle_data(%s)' % anomalous_metrics)

            run_time = time() - now
            total_metrics = str(len(boundary_metrics))
            total_analyzed = str(len(boundary_metrics) - sum(exceptions.values()))
            total_anomalies = str(len(self.anomalous_metrics))

            # Log progress
            logger.info('seconds to run    :: %.2f' % run_time)
            logger.info('total metrics     :: %s' % total_metrics)
            logger.info('total analyzed    :: %s' % total_analyzed)
            logger.info('total anomalies   :: %s' % total_anomalies)
            logger.info('exception stats   :: %s' % exceptions)
            logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

            # Log to Graphite
            graphite_run_time = '%.2f' % run_time
            send_metric_name = skyline_app_graphite_namespace + '.run_time'
            send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)

            send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
            send_graphite_metric(skyline_app, send_metric_name, total_analyzed)

            send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
            send_graphite_metric(skyline_app, send_metric_name, total_anomalies)

            send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
            send_graphite_metric(skyline_app, send_metric_name, total_metrics)
            for key, value in exceptions.items():
                send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))
            for key, value in anomaly_breakdown.items():
                send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
                send_graphite_metric(skyline_app, send_metric_name, str(value))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human
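                # time_human is the span of the canary timeseries in hours and
                # projected estimates the seconds a 24 hour period would take
                # to analyze at the current run rate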

                logger.info('canary duration   :: %.2f' % time_human)
                send_metric_name = skyline_app_graphite_namespace + '.duration'
                send_graphite_metric(skyline_app, send_metric_name, str(time_human))

                send_metric_name = skyline_app_graphite_namespace + '.projected'
                send_graphite_metric(skyline_app, send_metric_name, str(projected))

            # Reset counters
            self.anomalous_metrics[:] = []

            # Only run once per BOUNDARY_OPTIMUM_RUN_DURATION
            process_runtime = time() - now
            try:
                boundary_optimum_run_duration = settings.BOUNDARY_OPTIMUM_RUN_DURATION
            except:
                boundary_optimum_run_duration = 60

            if process_runtime < boundary_optimum_run_duration:
                sleep_for = (boundary_optimum_run_duration - process_runtime)
                logger.info('sleeping for %.2f seconds' % sleep_for)
                sleep(sleep_for)
Exemple #13
0
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
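        # Wait up to 5 seconds for the log lock file to clear before starting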
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started worker' % skyline_app)

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION
        full_uniques = '%sunique_metrics' % FULL_NAMESPACE
        mini_uniques = '%sunique_metrics' % MINI_NAMESPACE
        pipe = self.redis_conn.pipeline()

        last_send_to_graphite = time()
        queue_sizes = []

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    '%s :: can\'t connect to redis at socket path %s' %
                    (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH)
                pipe = self.redis_conn.pipeline()
                continue

            try:
                # Get a chunk from the queue with a 15 second timeout
                chunk = self.q.get(True, 15)
                now = time()

                for metric in chunk:

                    # Check if we should skip it
                    if self.in_skip_list(metric[0]):
                        continue

                    # Bad data coming in
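                    # i.e. drop any datapoint with a timestamp older than
                    # MAX_RESOLUTION seconds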
                    if metric[1][0] < now - MAX_RESOLUTION:
                        continue

                    # Append to messagepack main namespace
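                    # Each metric key holds a stream of concatenated
                    # msgpack-packed [timestamp, value] tuples, which readers
                    # decode with a streaming Unpacker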
                    key = ''.join((FULL_NAMESPACE, metric[0]))
                    pipe.append(key, packb(metric[1]))
                    pipe.sadd(full_uniques, key)

                    if not self.skip_mini:
                        # Append to mini namespace
                        mini_key = ''.join((MINI_NAMESPACE, metric[0]))
                        pipe.append(mini_key, packb(metric[1]))
                        pipe.sadd(mini_uniques, mini_key)

                    pipe.execute()

            except Empty:
                logger.info('%s :: worker queue is empty and timed out' %
                            skyline_app)
            except WatchError:
                logger.error('%s :: WatchError - %s' % (skyline_app, str(key)))
            except NotImplementedError:
                pass
            except Exception as e:
                logger.error('%s :: error: %s' % (skyline_app, str(e)))

            # Log progress
            if self.canary:
                logger.info('%s :: queue size at %d' %
                            (skyline_app, self.q.qsize()))
                queue_sizes.append(self.q.qsize())
                # Only send average queue metrics to graphite once per 10 seconds
                now = time()
                last_sent_graphite = now - last_send_to_graphite
                if last_sent_graphite > 10:
                    number_queue_sizes = len(queue_sizes)
                    total_of_queue_sizes = sum(queue_sizes)
                    if total_of_queue_sizes > 0:
                        average_queue_size = total_of_queue_sizes / number_queue_sizes
                    else:
                        average_queue_size = 0
                    logger.info(
                        '%s :: total queue size for the last 10 seconds - %s' %
                        (skyline_app, str(total_of_queue_sizes)))
                    logger.info(
                        '%s :: total queue values known for the last 10 seconds - %s'
                        % (skyline_app, str(number_queue_sizes)))
                    logger.info(
                        '%s :: average queue size for the last 10 seconds - %s'
                        % (skyline_app, str(average_queue_size)))
                    # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize())
                    # self.send_graphite_metric('queue_size', average_queue_size)
                    send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         average_queue_size)

                    # reset queue_sizes and last_sent_graphite
                    queue_sizes = []
                    last_send_to_graphite = time()
Exemple #14
0
    def run(self):
        """
        Called when the process initializes.
        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error :: bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_LUMINOSITY_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error(
                    'error :: cannot connect to redis at socket path %s' %
                    (settings.REDIS_SOCKET_PATH))
                sleep(30)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                if settings.REDIS_PASSWORD:
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                else:
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' %
                             skyline_app)
            """
            Determine if any new anomalies have been added
            """
            while True:
                process_anomaly_id = None
                last_processed_anomaly_id = None
                memcache_last_processed_anomaly_id_data = False
                # Check memcached before MySQL
                memcache_key = '%s.last.processed.anomaly.id' % skyline_app
                if settings.MEMCACHE_ENABLED:
                    try:
                        last_processed_anomaly_id = self.memcache_client.get(
                            memcache_key)
                        # if memcache does not have the key the client
                        # returns None, it does not raise an exception
                    except:
                        logger.error(
                            'error :: failed to get %s from memcache' %
                            memcache_key)
                    try:
                        self.memcache_client.close()
                    except:
                        logger.error(
                            'error :: failed to close memcache_client')

                if last_processed_anomaly_id:
                    logger.info(
                        'last_processed_anomaly_id found in memcache - %s' %
                        str(last_processed_anomaly_id))
                    memcache_last_processed_anomaly_id_data = True
                else:
                    logger.info(
                        'last_processed_anomaly_id key was NOT found in memcache - %s'
                        % str(last_processed_anomaly_id))

                if not last_processed_anomaly_id:
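                    # Fall back to the DB for the highest anomaly id recorded
                    # in the luminosity table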
                    query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1'
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        try:
                            last_processed_anomaly_id = int(results[0][0])
                            logger.info(
                                'last_processed_anomaly_id found from DB - %s'
                                % str(last_processed_anomaly_id))
                        except:
                            logger.error(traceback.format_exc())

                        if last_processed_anomaly_id and settings.MEMCACHE_ENABLED:
                            if not memcache_last_processed_anomaly_id_data:
                                logger.info(
                                    'Populating memcache with DB result - %s' %
                                    str(last_processed_anomaly_id))
                                try:
                                    self.memcache_client.set(
                                        memcache_key,
                                        int(last_processed_anomaly_id))
                                    logger.info(
                                        'populated memcache key %s with %s' %
                                        (memcache_key,
                                         str(last_processed_anomaly_id)))
                                except:
                                    logger.error(
                                        'error :: failed to set the memcache key - %s - %s'
                                        % (memcache_key,
                                           str(last_processed_anomaly_id)))
                                try:
                                    self.memcache_client.close()
                                except:
                                    logger.error(
                                        'error :: failed to close memcache_client'
                                    )

                if not last_processed_anomaly_id:
                    # Check MySQL
                    now = int(time())
                    after = now - 600
                    query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str(
                        after)  # nosec
                    results = None
                    try:
                        results = mysql_select(skyline_app, query)
                    except:
                        logger.error('error :: MySQL query failed - %s' % query)
                    if results:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                        # Handle the first one
                        last_processed_anomaly_id = process_anomaly_id - 1
                    else:
                        logger.info('no new anomalies in the anomalies table')

                query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str(
                    last_processed_anomaly_id)  # nosec
                results = None
                try:
                    results = mysql_select(skyline_app, query)
                except:
                    logger.error('error :: MySQL query failed - %s' % query)
                if results:
                    try:
                        process_anomaly_id = int(results[0][0])
                        logger.info(
                            'found the next new anomaly id to process from the DB - %s'
                            % str(process_anomaly_id))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: from query - %s' % query)
                else:
                    logger.info('no new anomalies in the anomalies table')

                if process_anomaly_id and last_processed_anomaly_id:
                    if isinstance(last_processed_anomaly_id, int):
                        if isinstance(process_anomaly_id, int):
                            if last_processed_anomaly_id == process_anomaly_id:
                                logger.info(
                                    'anomaly id already processed - %s' %
                                    str(process_anomaly_id))
                                process_anomaly_id = None

                if not process_anomaly_id:
                    logger.info(
                        'sleeping 20 seconds, no anomalies to correlate - last processed anomaly id - %s'
                        % str(last_processed_anomaly_id))
                    sleep(20)
                    up_now = time()
                    # Report app up
                    try:
                        self.redis_conn.setex(skyline_app, 120, up_now)
                        logger.info('updated Redis key for %s up' %
                                    skyline_app)
                    except:
                        logger.error(
                            'error :: failed to update Redis key for %s up' %
                            skyline_app)

                cache_key = '%s.sent_graphite_metrics' % skyline_app
                redis_sent_graphite_metrics = False
                try:
                    redis_sent_graphite_metrics = self.redis_conn.get(
                        cache_key)
                except Exception as e:
                    logger.error(
                        'error :: could not query Redis for key %s: %s' %
                        (cache_key, e))

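                # The sent_graphite_metrics key is set with a 59 second TTL
                # below, so this block runs roughly once per minute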
                # Flush metrics to Graphite
                if not redis_sent_graphite_metrics:
                    try:
                        total_anomalies = str(len(self.anomalous_metrics))
                    except:
                        total_anomalies = '0'
                    logger.info('total_anomalies    :: %s' % total_anomalies)
                    send_metric_name = '%s.total_anomalies' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         total_anomalies)

                    try:
                        correlations = str(len(self.correlations))
                    except:
                        correlations = '0'
                    logger.info('correlations       :: %s' % correlations)
                    send_metric_name = '%s.correlations' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         correlations)

                    sent_graphite_metrics_now = int(time())
                    try:
                        self.redis_conn.setex(cache_key, 59,
                                              sent_graphite_metrics_now)
                        logger.info('updated Redis key - %s' % cache_key)
                    except:
                        logger.error(
                            'error :: failed to update Redis key - %s' %
                            cache_key)

                    # Reset lists
                    self.anomalous_metrics[:] = []
                    self.correlations[:] = []

                if process_anomaly_id:
                    break

            # Spawn process
            logger.info('spawning processes to correlate anomaly id %s' %
                        str(process_anomaly_id))
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
            for i in range(1, luminosity_processes + 1):
                try:
                    p = Process(target=self.spin_process,
                                args=(i, process_anomaly_id))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' %
                                (str(pid_count), str(luminosity_processes)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to start spin_process')
                    continue

            # Self monitor processes and terminate if any spin_process has run
            # for too long
            p_starts = time()
            while time() - p_starts <= 60:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info('%s spin_process completed in %.2f seconds' %
                                (str(luminosity_processes), time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('timed out, killing all spin_process processes')
                for p in pids:
                    try:
                        p.terminate()
                        # p.join()
                        logger.info('killed spin_process process')
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: killing all spin_process processes')

            for p in pids:
                if p.is_alive():
                    logger.info('stopping spin_process - %s' %
                                (str(p.is_alive())))
                    p.join()

            process_runtime = time() - now
            if process_runtime < 10:
                sleep_for = (10 - process_runtime)
                logger.info(
                    'sleeping for %.2f seconds due to low run time...' %
                    sleep_for)
                sleep(sleep_for)
                try:
                    del sleep_for
                except:
                    logger.error('error :: failed to del sleep_for')
            try:
                del process_runtime
            except:
                logger.error('error :: failed to del process_runtime')
Exemple #15
0
    def submit_PrometheusMetrics(self):
        """
        Get Prometheus metrics from the Redis flux.prometheus_metrics hash key
        and submit them to the Horizon worker queue
        """
        last_hash_check = int(time())
        last_log = int(last_hash_check)
        submitted_metrics = 0

        running = True
        while running:
            self.check_if_parent_is_alive()
            now_ts = int(time())

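            # Log the submitted metrics count roughly once per minute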
            if now_ts >= (last_log + 59):
                logger.info(
                    'prometheus :: submit_PrometheusMetrics - submitted %s metrics to worker in the last 60 seconds'
                    % str(submitted_metrics))
                submitted_metrics = 0
                last_log = int(now_ts)

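            # Throttle the hash check to at most once per second, note this
            # loops without sleeping until the next second boundary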
            if (now_ts - last_hash_check) < 1:
                last_hash_check = int(now_ts)
                continue
            last_hash_check = int(now_ts)
            try:
                prometheus_metrics = {}
                try:
                    prometheus_metrics = self.redis_conn_decoded.hgetall(
                        'flux.prometheus_metrics')
                except Exception as err:
                    logger.error(
                        'error :: prometheus :: submit_PrometheusMetrics could not determine prometheus_metrics from flux.prometheus_metrics Redis hash - %s'
                        % str(err))
                    prometheus_metrics = {}
                if not prometheus_metrics:
                    continue

                chunk = []
                metrics_added = 0
                prometheus_metrics_list = list(prometheus_metrics.keys())
                for prometheus_metric in prometheus_metrics_list:
                    if metrics_added >= HORIZON_PROMETHEUS_METRIC_LIMIT:
                        break
                    try:
                        metric_data_str = prometheus_metrics[prometheus_metric]
                        # The data is stored by flux in the Redis hash key as
                        # 'metric': 'timestamp,value' to reduce the overhead of
                        # needing to use literal_eval on every call, instead a
                        # simple str split is used
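                        # e.g. (illustrative) '1650000000,0.123' splits into
                        # timestamp '1650000000' and value '0.123'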
                        timestamp = metric_data_str.split(',')[0]
                        value = metric_data_str.split(',')[1]
                        chunk.append([
                            prometheus_metric, [int(timestamp),
                                                float(value)]
                        ])
                        metrics_added += 1
                        submitted_metrics += 1
                        self.redis_conn_decoded.hdel('flux.prometheus_metrics',
                                                     prometheus_metric)
                    except Exception as err:
                        logger.error(
                            'error :: prometheus :: submit_PrometheusMetrics failed to process %s from the flux.prometheus_metrics Redis hash - %s'
                            % (prometheus_metric, str(err)))
                    if len(chunk) > settings.CHUNK_SIZE:
                        break
                if chunk:
                    try:
                        self.q.put(list(chunk), block=False)
                        chunk[:] = []
                    # Drop chunk if queue is full
                    except Full:
                        chunks_dropped = str(len(chunk))
                        logger.info(
                            '%s :: prometheus :: submit_PrometheusMetrics :: worker queue is full, dropping %s Prometheus metric datapoints'
                            % (skyline_app, chunks_dropped))
                        send_metric_name = '%s.prometheus_chunks_dropped' % skyline_app_graphite_namespace
                        send_graphite_metric(skyline_app, send_metric_name,
                                             chunks_dropped)
                        chunk[:] = []

            except Exception as err:
                logger.info(
                    '%s :: prometheus :: submit_PrometheusMetrics :: error adding check to queue - %s'
                    % (skyline_app, str(err)))
Exemple #16
0
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing'
                % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' %
                             skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started worker' % skyline_app)

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION
        full_uniques = '%sunique_metrics' % FULL_NAMESPACE
        mini_uniques = '%sunique_metrics' % MINI_NAMESPACE
        pipe = self.redis_conn.pipeline()

        last_send_to_graphite = time()
        queue_sizes = []

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error(
                    '%s :: cannot connect to Redis at socket path %s' %
                    (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
                if settings.REDIS_PASSWORD:
                    # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings
                    #                      Branch #3262: py3
                    # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = StrictRedis(
                        password=settings.REDIS_PASSWORD,
                        unix_socket_path=settings.REDIS_SOCKET_PATH,
                        charset='utf-8',
                        decode_responses=True)
                else:
                    # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                    self.redis_conn = StrictRedis(
                        unix_socket_path=settings.REDIS_SOCKET_PATH,
                        charset='utf-8',
                        decode_responses=True)
                pipe = self.redis_conn.pipeline()
                continue

            try:
                # Get a chunk from the queue with a 15 second timeout
                chunk = self.q.get(True, 15)
                # @modified 20170317 - Feature #1978: worker - DO_NOT_SKIP_LIST
                # now = time()
                now = int(time())

                for metric in chunk:

                    # Check if we should skip it
                    if self.in_skip_list(metric[0]):
                        continue

                    # Bad data coming in
                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # if metric[1][0] < now - MAX_RESOLUTION:
                    #     continue
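                    # If the timestamp cannot be cast to int the datapoint is
                    # not dropped, it falls through and is processed anyway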
                    try:
                        if int(metric[1][0]) < now - MAX_RESOLUTION:
                            continue
                    except:
                        pass

                    # Append to messagepack main namespace
                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # key = ''.join((FULL_NAMESPACE, metric[0]))
                    key = ''.join((FULL_NAMESPACE, str(metric[0])))
                    if LOCAL_DEBUG:
                        logger.info(
                            'debug :: worker :: adding metric Redis key - %s' %
                            str(key))

                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # pipe.append(key, packb(metric[1]))
                    # pipe.sadd(full_uniques, key)
                    try:
                        pipe.append(str(key), packb(metric[1]))
                    except Exception as e:
                        logger.error('%s :: error on pipe.append: %s' %
                                     (skyline_app, str(e)))
                    try:
                        # pipe.sadd(full_uniques, key)
                        pipe.sadd(full_uniques, str(key))
                    except Exception as e:
                        logger.error('%s :: error on pipe.sadd: %s' %
                                     (skyline_app, str(e)))

                    if not self.skip_mini:
                        # Append to mini namespace
                        # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                        #                      Branch #3262: py3
                        #                      Bug #3266: py3 Redis binary objects not strings
                        # mini_key = ''.join((MINI_NAMESPACE, metric[0]))
                        mini_key = ''.join((MINI_NAMESPACE, str(metric[0])))
                        pipe.append(mini_key, packb(metric[1]))
                        pipe.sadd(mini_uniques, mini_key)

                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # pipe.execute()
                    try:
                        pipe.execute()
                    except Exception as e:
                        logger.error('%s :: error on pipe.execute: %s' %
                                     (skyline_app, str(e)))
            except Empty:
                logger.info('%s :: worker queue is empty and timed out' %
                            skyline_app)
            except WatchError:
                logger.error('%s :: WatchError - %s' % (skyline_app, str(key)))
            except NotImplementedError:
                pass
            except Exception as e:
                # @added 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                   Branch #3262: py3
                #                   Bug #3266: py3 Redis binary objects not strings
                # Added traceback
                logger.error(traceback.format_exc())
                logger.error('%s :: error: %s' % (skyline_app, str(e)))

            # Log progress
            if self.canary:
                logger.info('%s :: queue size at %d' %
                            (skyline_app, self.q.qsize()))
                queue_sizes.append(self.q.qsize())
                # Only send average queue metrics to graphite once per 10 seconds
                now = time()
                last_sent_graphite = now - last_send_to_graphite
                if last_sent_graphite > 10:
                    number_queue_sizes = len(queue_sizes)
                    total_of_queue_sizes = sum(queue_sizes)
                    if total_of_queue_sizes > 0:
                        average_queue_size = total_of_queue_sizes / number_queue_sizes
                    else:
                        average_queue_size = 0
                    logger.info(
                        '%s :: total queue size for the last 10 seconds - %s' %
                        (skyline_app, str(total_of_queue_sizes)))
                    logger.info(
                        '%s :: total queue values known for the last 10 seconds - %s'
                        % (skyline_app, str(number_queue_sizes)))
                    logger.info(
                        '%s :: average queue size for the last 10 seconds - %s'
                        % (skyline_app, str(average_queue_size)))
                    # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize())
                    # self.send_graphite_metric('queue_size', average_queue_size)
                    send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name,
                                         average_queue_size)

                    # reset queue_sizes and last_sent_graphite
                    queue_sizes = []
                    last_send_to_graphite = time()
Exemple #17
0
    def run(self):
        """
        Called when the process initializes.
        """
        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        logger.info('%s :: started worker' % skyline_app)

        FULL_NAMESPACE = settings.FULL_NAMESPACE
        MINI_NAMESPACE = settings.MINI_NAMESPACE
        MAX_RESOLUTION = settings.MAX_RESOLUTION
        full_uniques = '%sunique_metrics' % FULL_NAMESPACE
        mini_uniques = '%sunique_metrics' % MINI_NAMESPACE
        pipe = self.redis_conn.pipeline()

        last_send_to_graphite = time()
        queue_sizes = []

        # python-2.x and python3.x handle while 1 and while True differently
        # while 1:
        running = True
        while running:

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except:
                logger.error('%s :: can\'t connect to redis at socket path %s' % (skyline_app, settings.REDIS_SOCKET_PATH))
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                pipe = self.redis_conn.pipeline()
                continue

            try:
                # Get a chunk from the queue with a 15 second timeout
                chunk = self.q.get(True, 15)
                now = time()

                for metric in chunk:

                    # Check if we should skip it
                    if self.in_skip_list(metric[0]):
                        continue

                    # Bad data coming in
                    if metric[1][0] < now - MAX_RESOLUTION:
                        continue

                    # Append to messagepack main namespace
                    key = ''.join((FULL_NAMESPACE, metric[0]))
                    pipe.append(key, packb(metric[1]))
                    pipe.sadd(full_uniques, key)

                    if not self.skip_mini:
                        # Append to mini namespace
                        mini_key = ''.join((MINI_NAMESPACE, metric[0]))
                        pipe.append(mini_key, packb(metric[1]))
                        pipe.sadd(mini_uniques, mini_key)

                    pipe.execute()

            except Empty:
                logger.info('%s :: worker queue is empty and timed out' % skyline_app)
            except WatchError:
                logger.error('%s :: WatchError - %s' % (skyline_app, str(key)))
            except NotImplementedError:
                pass
            except Exception as e:
                logger.error('%s :: error: %s' % (skyline_app, str(e)))

            # Log progress
            if self.canary:
                logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize()))
                queue_sizes.append(self.q.qsize())
                # Only send average queue metrics to graphite once per 10 seconds
                now = time()
                last_sent_graphite = now - last_send_to_graphite
                if last_sent_graphite > 10:
                    number_queue_sizes = len(queue_sizes)
                    total_of_queue_sizes = sum(queue_sizes)
                    if total_of_queue_sizes > 0:
                        average_queue_size = total_of_queue_sizes / number_queue_sizes
                    else:
                        average_queue_size = 0
                    logger.info('%s :: total queue size for the last 10 seconds - %s' % (skyline_app, str(total_of_queue_sizes)))
                    logger.info('%s :: total queue values known for the last 10 seconds - %s' % (skyline_app, str(number_queue_sizes)))
                    logger.info('%s :: average queue size for the last 10 seconds - %s' % (skyline_app, str(average_queue_size)))
                    # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize())
                    # self.send_graphite_metric('queue_size', average_queue_size)
                    send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, average_queue_size)

                    # reset queue_sizes and last_sent_graphite
                    queue_sizes = []
                    last_send_to_graphite = time()