def listen_udp(self):
    """
    Listen over udp for MessagePack strings
    """
    while 1:
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            s.bind((self.ip, self.port))
            logger.info('%s :: listening over udp for messagepack on %s' % (skyline_app, self.port))

            chunk = []
            while 1:
                self.check_if_parent_is_alive()
                data, addr = s.recvfrom(1024)
                # @modified 20191014 - Task #3272: horizon - listen - py3 handle msgpack bytes
                #                      Branch #3262: py3
                #                      Bug #3266: py3 Redis binary objects not strings
                # msgpack encoding of bytes and not str as per
                # https://msgpack.org/#string-and-binary-type Python/msgpack
                # and https://stackoverflow.com/a/47070687/107406
                # metric = unpackb(data)
                if python_version == 3:
                    metric = unpackb(data, encoding='utf-8')
                else:
                    metric = unpackb(data)
                chunk.append(metric)

                # Queue the chunk and empty the variable
                if len(chunk) > settings.CHUNK_SIZE:
                    try:
                        self.q.put(list(chunk), block=False)
                        chunk[:] = []
                    # Drop chunk if queue is full
                    except Full:
                        chunks_dropped = str(len(chunk))
                        logger.info(
                            '%s :: UDP queue is full, dropping %s datapoints'
                            % (skyline_app, chunks_dropped))
                        send_metric_name = '%s.udp_chunks_dropped' % skyline_app_graphite_namespace
                        send_graphite_metric(skyline_app, send_metric_name, chunks_dropped)
                        chunk[:] = []

        except Exception as e:
            logger.info('%s :: cannot connect to socket: %s' % (skyline_app, str(e)))
            break
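# A minimal sketch (not part of Skyline) of a client feeding the listener
# above: each UDP datagram carries one msgpack-encoded
# [metric_name, [timestamp, value]] item, which matches what the horizon
# worker later reads as metric[0] and metric[1]. The host and port values
# here are assumptions for illustration only.
import socket
import time

from msgpack import packb

HORIZON_HOST = '127.0.0.1'   # assumed listener address
HORIZON_UDP_PORT = 2025      # assumed listener port


def send_datapoint(metric_name, value):
    # Pack the metric in the [name, [timestamp, value]] shape the
    # listener's unpackb() call expects
    datapoint = [metric_name, [int(time.time()), value]]
    payload = packb(datapoint)
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.sendto(payload, (HORIZON_HOST, HORIZON_UDP_PORT))
    sock.close()


send_datapoint('test.metric', 1.0)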
def run(self):
    """
    - Called when the process initializes.

    - Determine if Redis is up and discover the number of `unique metrics`.

    - Divide the `unique_metrics` between the number of `ANALYZER_PROCESSES`
      and assign each process a set of metrics to analyse for anomalies.

    - Wait for the processes to finish.

    - Determine whether any anomalous metrics require:

        - Alerting on (and set `EXPIRATION_TIME` key in Redis for alert).
        - Feed to another module e.g. mirage.
        - Alert to syslog.

    - Populate the webapp json with the anomalous_metrics details.

    - Log the details about the run to the skyline analyzer log.

    - Send skyline.analyzer metrics to `GRAPHITE_HOST`
    """

    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    if os.path.isfile(skyline_app_logwait):
        try:
            os.remove(skyline_app_logwait)
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
            pass

    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('starting %s run' % skyline_app)
    if os.path.isfile(skyline_app_loglock):
        logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os.remove(skyline_app_loglock)
            logger.info('log lock file removed')
        except OSError:
            logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('bin/%s.d log management done' % skyline_app)

    if not os.path.exists(settings.SKYLINE_TMP_DIR):
        if python_version == 2:
            mode_arg = int('0755', 8)
        if python_version == 3:
            mode_arg = 0o755
        os.makedirs(settings.SKYLINE_TMP_DIR, mode_arg)

    # Initiate the algorithm timings if Analyzer is configured to send the
    # algorithm_breakdown metrics with ENABLE_ALGORITHM_RUN_METRICS
    algorithm_tmp_file_prefix = settings.SKYLINE_TMP_DIR + '/' + skyline_app + '.'
    algorithms_to_time = []
    if send_algorithm_run_metrics:
        algorithms_to_time = settings.ALGORITHMS

    while 1:
        now = time()

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except:
            logger.error('skyline can\'t connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH)
            sleep(10)
            self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
            continue

        # Report app up
        self.redis_conn.setex(skyline_app, 120, now)

        # Discover unique metrics
        unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

        if len(unique_metrics) == 0:
            logger.info('no metrics in redis. try adding some - see README')
            sleep(10)
            continue

        # Using count files rather than multiprocessing.Value to enable
        # metrics for algorithm run times, etc
        for algorithm in algorithms_to_time:
            algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
            algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'
            # with open(algorithm_count_file, 'a') as f:
            with open(algorithm_count_file, 'w') as f:
                pass
            with open(algorithm_timings_file, 'w') as f:
                pass

        # Remove any existing algorithm.error files from any previous runs
        # that did not cleanup for any reason
        pattern = '%s.*.algorithm.error' % skyline_app
        try:
            for f in os.listdir(settings.SKYLINE_TMP_DIR):
                if re.search(pattern, f):
                    try:
                        os.remove(os.path.join(settings.SKYLINE_TMP_DIR, f))
                        logger.info('cleaning up old error file - %s' % (str(f)))
                    except OSError:
                        pass
        except:
            logger.error('failed to cleanup algorithm.error files ' + traceback.format_exc())

        # Spawn processes
        pids = []
        spawned_pids = []
        pid_count = 0
        for i in range(1, settings.ANALYZER_PROCESSES + 1):
            if i > len(unique_metrics):
                logger.info('WARNING: skyline is set for more cores than needed.')
                break
            p = Process(target=self.spin_process, args=(i, unique_metrics))
            pids.append(p)
            pid_count += 1
            logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.ANALYZER_PROCESSES)))
            p.start()
            spawned_pids.append(p.pid)

        # Send wait signal to zombie processes
        # for p in pids:
        #     p.join()

        # Self monitor processes and terminate if any spin_process has run
        # for longer than 180 seconds - 20160512 @earthgecko
        p_starts = time()
        while time() - p_starts <= settings.MAX_ANALYZER_PROCESS_RUNTIME:
            if any(p.is_alive() for p in pids):
                # Just to avoid hogging the CPU
                sleep(.1)
            else:
                # All the processes are done, break now.
                time_to_run = time() - p_starts
                logger.info(
                    '%s :: %s spin_process/es completed in %.2f seconds' % (
                        skyline_app, str(settings.ANALYZER_PROCESSES),
                        time_to_run))
                break
        else:
            # We only enter this if we didn't 'break' above.
            logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
            for p in pids:
                p.terminate()
                p.join()

        # Log the last reported error by any algorithms that errored in the
        # spawned processes from algorithms.py
        for completed_pid in spawned_pids:
            logger.info('spin_process with pid %s completed' % (str(completed_pid)))
            for algorithm in settings.ALGORITHMS:
                algorithm_error_file = '%s/%s.%s.%s.algorithm.error' % (
                    settings.SKYLINE_TMP_DIR, skyline_app,
                    str(completed_pid), algorithm)
                if os.path.isfile(algorithm_error_file):
                    logger.info(
                        'error - spin_process with pid %s has reported an error with the %s algorithm' % (
                            str(completed_pid), algorithm))
                    try:
                        with open(algorithm_error_file, 'r') as f:
                            error_string = f.read()
                        logger.error('%s' % str(error_string))
                    except:
                        logger.error('failed to read %s error file' % algorithm)
                    try:
                        os.remove(algorithm_error_file)
                    except OSError:
                        pass

        # Grab data from the queue and populate dictionaries
        exceptions = dict()
        anomaly_breakdown = dict()
        while 1:
            try:
                key, value = self.anomaly_breakdown_q.get_nowait()
                if key not in anomaly_breakdown.keys():
                    anomaly_breakdown[key] = value
                else:
                    anomaly_breakdown[key] += value
            except Empty:
                break

        while 1:
            try:
                key, value = self.exceptions_q.get_nowait()
                if key not in exceptions.keys():
                    exceptions[key] = value
                else:
                    exceptions[key] += value
            except Empty:
                break

        # Send alerts
        if settings.ENABLE_ALERTS:
            for alert in settings.ALERTS:
                for metric in self.anomalous_metrics:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric[1]
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        cache_key = 'last_alert.%s.%s' % (alert[1], metric[1])
                        try:
                            last_alert = self.redis_conn.get(cache_key)
                            if not last_alert:
                                try:
                                    SECOND_ORDER_RESOLUTION_FULL_DURATION = alert[3]
                                    logger.info('mirage check :: %s' % (metric[1]))
                                    # Write anomalous metric to test at second
                                    # order resolution by crucible to the check
                                    # file
                                    metric_timestamp = int(time())
                                    anomaly_check_file = '%s/%s.%s.txt' % (
                                        settings.MIRAGE_CHECK_PATH,
                                        metric_timestamp, metric[1])
                                    with open(anomaly_check_file, 'w') as fh:
                                        # metric_name, anomalous datapoint, hours to resolve, timestamp
                                        fh.write(
                                            'metric = "%s"\nvalue = "%s"\nhours_to_resolve = "%s"\nmetric_timestamp = "%s"\n' % (
                                                metric[1], metric[0],
                                                alert[3], metric_timestamp))
                                    if python_version == 2:
                                        mode_arg = int('0644', 8)
                                    if python_version == 3:
                                        mode_arg = 0o644
                                    os.chmod(anomaly_check_file, mode_arg)
                                    logger.info('added mirage check :: %s,%s,%s' % (metric[1], metric[0], alert[3]))
                                    # Add to the mirage_metrics list
                                    base_name = METRIC_PATTERN.replace(settings.FULL_NAMESPACE, '', 1)
                                    metric = [metric[0], base_name]
                                    self.mirage_metrics.append(metric)
                                    # Alert for analyzer if enabled
                                    if settings.ENABLE_FULL_DURATION_ALERTS:
                                        self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                        trigger_alert(alert, metric)
                                except:
                                    self.redis_conn.setex(cache_key, alert[2], packb(metric[0]))
                                    trigger_alert(alert, metric)
                        except Exception as e:
                            logger.error('error :: could not send alert: %s' % e)

        # Push to crucible
        # if len(self.crucible_anomalous_metrics) > 0:
        #     logger.info('to do - push to crucible')

        # Write anomalous_metrics to static webapp directory
        if len(self.anomalous_metrics) > 0:
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)

        # Using count files rather than multiprocessing.Value to enable
        # metrics for algorithm run times, etc
        for algorithm in algorithms_to_time:
            algorithm_count_file = algorithm_tmp_file_prefix + algorithm + '.count'
            algorithm_timings_file = algorithm_tmp_file_prefix + algorithm + '.timings'

            try:
                algorithm_count_array = []
                with open(algorithm_count_file, 'r') as f:
                    for line in f:
                        value_string = line.replace('\n', '')
                        unquoted_value_string = value_string.replace("'", '')
                        float_value = float(unquoted_value_string)
                        algorithm_count_array.append(float_value)
            except:
                algorithm_count_array = False

            if not algorithm_count_array:
                continue

            number_of_times_algorithm_run = len(algorithm_count_array)
            logger.info(
                'algorithm run count - %s run %s times' % (
                    algorithm, str(number_of_times_algorithm_run)))
            if number_of_times_algorithm_run == 0:
                continue

            try:
                algorithm_timings_array = []
                with open(algorithm_timings_file, 'r') as f:
                    for line in f:
                        value_string = line.replace('\n', '')
                        unquoted_value_string = value_string.replace("'", '')
                        float_value = float(unquoted_value_string)
                        algorithm_timings_array.append(float_value)
            except:
                algorithm_timings_array = False

            if not algorithm_timings_array:
                continue

            number_of_algorithm_timings = len(algorithm_timings_array)
            logger.info(
                'algorithm timings count - %s has %s timings' % (
                    algorithm, str(number_of_algorithm_timings)))
            if number_of_algorithm_timings == 0:
                continue

            try:
                _sum_of_algorithm_timings = sum(algorithm_timings_array)
            except:
                logger.error("sum error: " + traceback.format_exc())
                _sum_of_algorithm_timings = round(0.0, 6)
                logger.error('error - sum_of_algorithm_timings - %s' % (algorithm))
                continue

            sum_of_algorithm_timings = round(_sum_of_algorithm_timings, 6)
            # logger.info('sum_of_algorithm_timings - %s - %.16f seconds' % (algorithm, sum_of_algorithm_timings))

            try:
                _median_algorithm_timing = determine_median(algorithm_timings_array)
            except:
                _median_algorithm_timing = round(0.0, 6)
                logger.error('error - _median_algorithm_timing - %s' % (algorithm))
                continue
            median_algorithm_timing = round(_median_algorithm_timing, 6)
            # logger.info('median_algorithm_timing - %s - %.16f seconds' % (algorithm, median_algorithm_timing))

            logger.info(
                'algorithm timing - %s - total: %.6f - median: %.6f' % (
                    algorithm, sum_of_algorithm_timings,
                    median_algorithm_timing))

            use_namespace = skyline_app_graphite_namespace + '.algorithm_breakdown.' + algorithm
            send_metric_name = use_namespace + '.timing.times_run'
            send_graphite_metric(skyline_app, send_metric_name, str(number_of_algorithm_timings))
            send_metric_name = use_namespace + '.timing.total_time'
            send_graphite_metric(skyline_app, send_metric_name, str(sum_of_algorithm_timings))
            send_metric_name = use_namespace + '.timing.median_time'
            send_graphite_metric(skyline_app, send_metric_name, str(median_algorithm_timing))

        run_time = time() - now
        total_metrics = str(len(unique_metrics))
        total_analyzed = str(len(unique_metrics) - sum(exceptions.values()))
        total_anomalies = str(len(self.anomalous_metrics))

        # Log progress
        logger.info('seconds to run :: %.2f' % run_time)
        logger.info('total metrics :: %s' % total_metrics)
        logger.info('total analyzed :: %s' % total_analyzed)
        logger.info('total anomalies :: %s' % total_anomalies)
        logger.info('exception stats :: %s' % exceptions)
        logger.info('anomaly breakdown :: %s' % anomaly_breakdown)

        # Log to Graphite
        graphite_run_time = '%.2f' % run_time
        send_metric_name = skyline_app_graphite_namespace + '.run_time'
        send_graphite_metric(skyline_app, send_metric_name, graphite_run_time)
        send_metric_name = skyline_app_graphite_namespace + '.total_analyzed'
        send_graphite_metric(skyline_app, send_metric_name, total_analyzed)
        send_metric_name = skyline_app_graphite_namespace + '.total_anomalies'
        send_graphite_metric(skyline_app, send_metric_name, total_anomalies)
        send_metric_name = skyline_app_graphite_namespace + '.total_metrics'
        send_graphite_metric(skyline_app, send_metric_name, total_metrics)
        for key, value in exceptions.items():
            send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key)
            send_graphite_metric(skyline_app, send_metric_name, str(value))
        for key, value in anomaly_breakdown.items():
            send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key)
            send_graphite_metric(skyline_app, send_metric_name, str(value))

        # Check canary metric
        raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
        if raw_series is not None:
            unpacker = Unpacker(use_list=False)
            unpacker.feed(raw_series)
            timeseries = list(unpacker)
            time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
            projected = 24 * (time() - now) / time_human
            logger.info('canary duration :: %.2f' % time_human)
            send_metric_name = skyline_app_graphite_namespace + '.duration'
            send_graphite_metric(skyline_app, send_metric_name, str(time_human))
            send_metric_name = skyline_app_graphite_namespace + '.projected'
            send_graphite_metric(skyline_app, send_metric_name, str(projected))

        # Reset counters
        self.anomalous_metrics[:] = []

        # Sleep if it went too fast
        # if time() - now < 5:
        #     logger.info('sleeping due to low run time...')
        #     sleep(10)
        # @modified 20160504 - @earthgecko - development internal ref #1338, #1340
        # Etsy's original for this was a value of 5 seconds which does
        # not make skyline Analyzer very efficient in terms of installations
        # where 100s of 1000s of metrics are being analyzed. This led to
        # Analyzer running over several metrics multiple times in a minute
        # and always working. Therefore this was changed from only sleeping
        # if the run took less than 5 seconds. That behaviour resulted in
        # Analyzer analysing a few 1000 metrics in 9 seconds and then doing
        # it again and again in a single minute. Therefore the
        # ANALYZER_OPTIMUM_RUN_DURATION setting was added to allow this to
        # self optimise in cases where skyline is NOT deployed to analyze
        # 100s of 1000s of metrics. This relates to optimising performance
        # for any deployments in the few 1000s and 60 second resolution
        # area, e.g. smaller and local deployments.
        process_runtime = time() - now
        analyzer_optimum_run_duration = settings.ANALYZER_OPTIMUM_RUN_DURATION
        if process_runtime < analyzer_optimum_run_duration:
            sleep_for = (analyzer_optimum_run_duration - process_runtime)
            logger.info('sleeping for %.2f seconds due to low run time...' % sleep_for)
            sleep(sleep_for)
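# The mirage check file written by the alert block above is a simple
# one key = "value" pair per line format. A minimal sketch, not Skyline's
# actual mirage loader, of parsing such a file back into a dict; the
# example values in the comment are made up for illustration:
#   metric = "stats.web01.cpu"
#   value = "87.3"
#   hours_to_resolve = "168"
#   metric_timestamp = "1556817000"
def parse_mirage_check_file(anomaly_check_file):
    check = {}
    with open(anomaly_check_file, 'r') as fh:
        for line in fh:
            if ' = ' not in line:
                continue
            key, _, quoted_value = line.rstrip('\n').partition(' = ')
            # Strip the surrounding double quotes the writer added
            check[key] = quoted_value.strip('"')
    return check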
def run(self):
    """
    Called when the process initializes.
    """

    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    # @modified 20201017 - Feature #3788: snab_flux_load_test
    #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
    # Only do log management with the canary worker
    if self.canary:
        if os.path.isfile(skyline_app_logwait):
            try:
                os_remove(skyline_app_logwait)
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_logwait)
                pass

    now = time()
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = time()
        else:
            now = log_wait_for + 1

    logger.info('starting %s worker number %s run' % (skyline_app, self.worker_number))

    # @modified 20201017 - Feature #3788: snab_flux_load_test
    #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
    if self.canary:
        if os.path.isfile(skyline_app_loglock):
            logger.error(
                'error - bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os_remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error - failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

    logger.info('%s :: started worker %s' % (skyline_app, str(self.worker_number)))

    # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
    skip_metrics_dict = {}
    do_not_skip_metrics_dict = {}
    skip_metrics_list = []
    do_not_skip_metrics_list = []
    checked_for_skip_count = 0

    # @added 20201103 - Feature #3820: HORIZON_SHARDS
    if HORIZON_SHARDS:
        logger.info(
            '%s :: HORIZON_SHARDS declared, this horizon instance is assigned shard %s' % (
                skyline_app, str(HORIZON_SHARD)))

    FULL_NAMESPACE = settings.FULL_NAMESPACE
    MINI_NAMESPACE = settings.MINI_NAMESPACE
    MAX_RESOLUTION = settings.MAX_RESOLUTION
    full_uniques = '%sunique_metrics' % FULL_NAMESPACE
    mini_uniques = '%sunique_metrics' % MINI_NAMESPACE
    pipe = self.redis_conn.pipeline()

    last_send_to_graphite = time()
    queue_sizes = []

    # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
    last_log_queue_size = time()
    queue_sizes_60_seconds = []

    # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
    datapoints_sent_to_redis = 0
    last_datapoints_to_redis = int(time())

    # @added 20201103 - Feature #3820: HORIZON_SHARDS
    if HORIZON_SHARDS:
        horizon_shard_assigned_metrics = []
        horizon_shard_dropped_metrics = []

    # @added 20201120 - Feature #3820: HORIZON_SHARDS
    metrics_received = []

    # python-2.x and python3.x handle while 1 and while True differently
    # while 1:
    running = True
    while running:

        # Make sure Redis is up
        try:
            self.redis_conn.ping()
        except:
            logger.error(
                '%s :: cannot connect to Redis at socket path %s' % (
                    skyline_app, settings.REDIS_SOCKET_PATH))
            sleep(10)
            # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow
            if settings.REDIS_PASSWORD:
                # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                self.redis_conn = StrictRedis(
                    password=settings.REDIS_PASSWORD,
                    unix_socket_path=settings.REDIS_SOCKET_PATH,
                    charset='utf-8', decode_responses=True)
            else:
                # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                self.redis_conn = StrictRedis(
                    unix_socket_path=settings.REDIS_SOCKET_PATH,
                    charset='utf-8', decode_responses=True)
            pipe = self.redis_conn.pipeline()
            continue

        # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
        if not skip_metrics_dict:
            try:
                skip_metrics_dict = self.redis_conn_decoded.hgetall('horizon.skip_metrics')
                logger.info(
                    '%s :: got %s metrics from horizon.skip_metrics from Redis hash as not set' % (
                        skyline_app, str(len(skip_metrics_dict))))
            except Exception as err:
                logger.error(
                    '%s :: error on hgetall horizon.skip_metrics: %s' % (
                        skyline_app, str(err)))
        if not do_not_skip_metrics_dict:
            try:
                do_not_skip_metrics_dict = self.redis_conn_decoded.hgetall('horizon.do_not_skip_metrics')
                logger.info(
                    '%s :: got %s metrics from horizon.do_not_skip_metrics from Redis hash as not set' % (
                        skyline_app, str(len(do_not_skip_metrics_dict))))
            except Exception as err:
                logger.error(
                    '%s :: error on hgetall horizon.do_not_skip_metrics: %s' % (
                        skyline_app, str(err)))

        try:
            # Get a chunk from the queue with a 15 second timeout
            chunk = self.q.get(True, 15)
            # @modified 20170317 - Feature #1978: worker - DO_NOT_SKIP_LIST
            # now = time()
            now = int(time())

            for metric in chunk:

                # @added 20201120 - Feature #3820: HORIZON_SHARDS
                try:
                    metrics_received.append(metric[0])
                except:
                    pass

                # @added 20201103 - Feature #3820: HORIZON_SHARDS
                # If a metric does not map to the HORIZON_SHARD, drop it
                # and continue
                if HORIZON_SHARDS and number_of_horizon_shards:
                    if not self.in_shard(metric[0]):
                        horizon_shard_dropped_metrics.append(metric[0])
                        continue
                    else:
                        horizon_shard_assigned_metrics.append(metric[0])

                # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                found_in_do_not_skip_dict = False
                if do_not_skip_metrics_dict:
                    try:
                        found_in_do_not_skip_dict = do_not_skip_metrics_dict[metric[0]]
                    except KeyError:
                        found_in_do_not_skip_dict = False
                if found_in_do_not_skip_dict:
                    do_not_skip_metrics_list.append(metric[0])
                found_in_skip_dict = False
                if not found_in_do_not_skip_dict and skip_metrics_dict:
                    try:
                        found_in_skip_dict = skip_metrics_dict[metric[0]]
                    except KeyError:
                        found_in_skip_dict = False
                if found_in_skip_dict:
                    skip_metrics_list.append(metric[0])
                    continue

                # @modified 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                if not found_in_do_not_skip_dict:
                    checked_for_skip_count += 1
                    # Check if we should skip it
                    if self.in_skip_list(metric[0]):
                        # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                        skip_metrics_list.append(metric[0])
                        continue

                # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
                if not found_in_do_not_skip_dict:
                    do_not_skip_metrics_list.append(metric[0])

                # Bad data coming in
                # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                      Branch #3262: py3
                #                      Bug #3266: py3 Redis binary objects not strings
                # if metric[1][0] < now - MAX_RESOLUTION:
                #     continue
                try:
                    if int(metric[1][0]) < now - MAX_RESOLUTION:
                        continue
                except:
                    pass

                # Append to messagepack main namespace
                # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                      Branch #3262: py3
                #                      Bug #3266: py3 Redis binary objects not strings
                # key = ''.join((FULL_NAMESPACE, metric[0]))
                key = ''.join((FULL_NAMESPACE, str(metric[0])))
                if LOCAL_DEBUG:
                    logger.info('debug :: worker :: adding metric Redis key - %s' % str(key))

                # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                      Branch #3262: py3
                #                      Bug #3266: py3 Redis binary objects not strings
                # pipe.append(key, packb(metric[1]))
                # pipe.sadd(full_uniques, key)
                try:
                    pipe.append(str(key), packb(metric[1]))
                    # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
                    datapoints_sent_to_redis += 1
                except Exception as e:
                    logger.error('%s :: error on pipe.append: %s' % (skyline_app, str(e)))
                try:
                    # pipe.sadd(full_uniques, key)
                    pipe.sadd(full_uniques, str(key))
                except Exception as e:
                    logger.error('%s :: error on pipe.sadd: %s' % (skyline_app, str(e)))

                if not self.skip_mini:
                    # Append to mini namespace
                    # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                    #                      Branch #3262: py3
                    #                      Bug #3266: py3 Redis binary objects not strings
                    # mini_key = ''.join((MINI_NAMESPACE, metric[0]))
                    mini_key = ''.join((MINI_NAMESPACE, str(metric[0])))
                    pipe.append(mini_key, packb(metric[1]))
                    pipe.sadd(mini_uniques, mini_key)

                # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7
                #                      Branch #3262: py3
                #                      Bug #3266: py3 Redis binary objects not strings
                # pipe.execute()
                try:
                    pipe.execute()
                except Exception as e:
                    logger.error('%s :: error on pipe.execute: %s' % (skyline_app, str(e)))

        except Empty:
            logger.info('%s :: worker queue is empty and timed out' % skyline_app)
        except WatchError:
            logger.error('%s :: WatchError - %s' % (skyline_app, str(key)))
        except NotImplementedError:
            pass
        except Exception as e:
            # @added 20190130 - Task #2690: Test Skyline on Python-3.6.7
            #                    Branch #3262: py3
            #                    Bug #3266: py3 Redis binary objects not strings
            # Added traceback
            logger.error(traceback.format_exc())
            logger.error('%s :: error: %s' % (skyline_app, str(e)))

        # Log progress
        if self.canary:
            # @modified 20210511
            # Do not log queue size for every chunk
            # logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize()))
            queue_sizes.append(self.q.qsize())
            # Only send average queue metrics to graphite once per 10 seconds
            now = time()
            # @modified 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
            # last_sent_graphite = now - last_send_to_graphite
            # if last_sent_graphite > 10:
            last_logged_queue_size = now - last_log_queue_size
            if last_logged_queue_size > 10:
                number_queue_sizes = len(queue_sizes)
                total_of_queue_sizes = sum(queue_sizes)
                if total_of_queue_sizes > 0:
                    average_queue_size = total_of_queue_sizes / number_queue_sizes
                else:
                    average_queue_size = 0
                logger.info(
                    '%s :: total queue size for the last 10 seconds - %s' % (
                        skyline_app, str(total_of_queue_sizes)))
                logger.info(
                    '%s :: total queue values known for the last 10 seconds - %s' % (
                        skyline_app, str(number_queue_sizes)))
                logger.info(
                    '%s :: average queue size for the last 10 seconds - %s' % (
                        skyline_app, str(average_queue_size)))
                # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize())
                # self.send_graphite_metric('queue_size', average_queue_size)
                send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace
                send_graphite_metric(skyline_app, send_metric_name, average_queue_size)
                # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
                for size_value in queue_sizes:
                    queue_sizes_60_seconds.append(size_value)
                # reset queue_sizes
                queue_sizes = []
                # last_send_to_graphite = time()
                last_log_queue_size = time()

            # @added 20210416 - Task #4020: Add horizon.queue_size_60s_avg metric
            # Moved from the 10 second block above to its own 60 seconds block
            last_sent_graphite = now - last_send_to_graphite
            if last_sent_graphite >= 60:
                try:
                    number_queue_sizes = len(queue_sizes_60_seconds)
                    total_of_queue_sizes = sum(queue_sizes_60_seconds)
                    if total_of_queue_sizes > 0:
                        average_queue_size = total_of_queue_sizes / number_queue_sizes
                    else:
                        average_queue_size = 0
                    logger.info(
                        '%s :: total queue size for the last 60 seconds - %s' % (
                            skyline_app, str(total_of_queue_sizes)))
                    logger.info(
                        '%s :: total queue values known for the last 60 seconds - %s' % (
                            skyline_app, str(number_queue_sizes)))
                    logger.info(
                        '%s :: average queue size for the last 60 seconds - %s' % (
                            skyline_app, str(average_queue_size)))
                    send_metric_name = '%s.queue_size_60s_avg' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, average_queue_size)
                    # reset queue_sizes_60_seconds and last_sent_graphite
                    queue_sizes_60_seconds = []
                    last_send_to_graphite = time()
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error('%s :: error: %s' % (skyline_app, str(e)))

                # @added 20210520 - Branch #1444: thunder
                # Added Redis hash keys for thunder to monitor the horizon
                # listen state, inferred via the
                # horizon.worker.average_queue_size, without incurring the
                # addition of Redis to horizon/listen. This is used by
                # Thunder to notify on degradation of service and operational
                # changes.
                cache_key = 'horizon.worker.average_queue_size'
                try:
                    self.redis_conn.hset(cache_key, 'value', int(average_queue_size))
                    self.redis_conn.hset(cache_key, 'timestamp', int(now))
                except Exception as e:
                    logger.error(
                        'error :: horizon :: worker :: could not update the Redis %s key - %s' % (
                            cache_key, e))

        # @added 20200815 - Feature #3680: horizon.worker.datapoints_sent_to_redis
        # @modified 20201017 - Feature #3788: snab_flux_load_test
        #                      Feature #3680: horizon.worker.datapoints_sent_to_redis
        # Send for each worker
        last_datapoints_count_to_redis = now - last_datapoints_to_redis
        if last_datapoints_count_to_redis >= 60:
            logger.info(
                '%s :: datapoints_sent_to_redis in last 60 seconds - %s' % (
                    skyline_app, str(datapoints_sent_to_redis)))
            if self.canary:
                send_metric_name = '%s.datapoints_sent_to_redis' % (
                    skyline_app_graphite_namespace)
            else:
                send_metric_name = '%s.datapoints_sent_to_redis_%s' % (
                    skyline_app_graphite_namespace, str(self.worker_number))
            send_graphite_metric(skyline_app, send_metric_name, datapoints_sent_to_redis)
            datapoints_sent_to_redis = 0
            last_datapoints_to_redis = int(time())

            # @added 20201120 - Feature #3820: HORIZON_SHARDS
            if HORIZON_SHARDS:
                try:
                    self.redis_conn.sadd(
                        'horizon.shards.metrics_assigned',
                        *set(horizon_shard_assigned_metrics))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        '%s :: error adding horizon.shards.metrics_assigned: %s' % (
                            skyline_app, str(e)))
                try:
                    self.redis_conn.sadd(
                        'horizon.shard.metrics_dropped',
                        *set(horizon_shard_dropped_metrics))
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        '%s :: error adding horizon.shard.metrics_dropped: %s' % (
                            skyline_app, str(e)))

            # @added 20201103 - Feature #3820: HORIZON_SHARDS
            if HORIZON_SHARDS and HORIZON_SHARD_DEBUG:
                horizon_shard_assigned_metrics_count = len(horizon_shard_assigned_metrics)
                horizon_shard_dropped_metrics_count = len(horizon_shard_dropped_metrics)
                logger.info(
                    '%s :: %s assigned metrics to HORIZON_SHARD %s and dropped %s metrics in last 60 seconds' % (
                        skyline_app, str(horizon_shard_assigned_metrics_count),
                        str(HORIZON_SHARD), str(horizon_shard_dropped_metrics_count)))
                if LOCAL_DEBUG:
                    logger.info(
                        '%s :: horizon_shard_assigned_metrics - %s' % (
                            skyline_app, str(horizon_shard_assigned_metrics)))
                    logger.info(
                        '%s :: horizon_shard_dropped_metrics - %s' % (
                            skyline_app, str(horizon_shard_dropped_metrics)))
            horizon_shard_assigned_metrics = []
            horizon_shard_dropped_metrics = []

            # @added 20201120 - Feature #3820: HORIZON_SHARDS
            try:
                self.redis_conn.sadd('horizon.metrics_received', *set(metrics_received))
            except:
                logger.error(traceback.format_exc())
                logger.error('%s :: error adding horizon.metrics_received' % (skyline_app))
            if self.canary:
                try:
                    logger.info('renaming key horizon.metrics_received to aet.horizon.metrics_received')
                    self.redis_conn.rename('horizon.metrics_received', 'aet.horizon.metrics_received')
                except Exception as e:
                    logger.info(traceback.format_exc())
                    logger.error(
                        'error :: failed to rename Redis key horizon.metrics_received to aet.horizon.metrics_received - %s' % e)

                # @added 20210520 - Branch #1444: thunder
                # Added Redis hash keys for thunder to monitor the horizon
                # listen state, inferred via the
                # horizon.worker.average_queue_size, without incurring the
                # addition of Redis to horizon/listen. This is used by
                # Thunder to notify on degradation of service and operational
                # changes.
                cache_key = 'horizon'
                try:
                    self.redis_conn.setex(cache_key, 120, int(now))
                except Exception as e:
                    logger.error(
                        'error :: horizon :: worker :: could not update the Redis %s key - %s' % (
                            cache_key, e))
                metrics_received_count = 0
                try:
                    metrics_received_count = self.redis_conn.scard('aet.horizon.metrics_received')
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error(
                        '%s :: error running redis_conn.scard(\'aet.horizon.metrics_received\') - %s' % (
                            skyline_app, e))
                cache_key = 'thunder.horizon.worker.metrics_received'
                try:
                    self.redis_conn.hset(cache_key, 'value', int(metrics_received_count))
                    self.redis_conn.hset(cache_key, 'timestamp', int(now))
                except Exception as e:
                    logger.error(
                        'error :: horizon :: worker :: could not update the Redis %s key - %s' % (
                            cache_key, e))
                send_metric_name = '%s.metrics_received' % (skyline_app_graphite_namespace)
                send_graphite_metric(skyline_app, send_metric_name, metrics_received_count)
            metrics_received = []

            # @added 20220216 - Feature #4446: Optimise horizon worker in_skip_list
            if skip_metrics_list:
                update_skip_metrics = {}
                for metric in list(set(skip_metrics_list)):
                    update_entry = False
                    if skip_metrics_dict:
                        try:
                            # Only update skip metrics every hour to save on
                            # Redis overhead
                            last_update_timestamp_str = skip_metrics_dict[metric]
                            if last_update_timestamp_str:
                                last_update_timestamp = int(float(last_update_timestamp_str))
                                if (now - last_update_timestamp) >= 3600:
                                    update_entry = True
                        except KeyError:
                            update_entry = True
                        except:
                            update_entry = True
                    else:
                        update_entry = True
                    if update_entry:
                        update_skip_metrics[metric] = int(now)
                if update_skip_metrics:
                    try:
                        self.redis_conn_decoded.hset(
                            'horizon.skip_metrics', mapping=update_skip_metrics)
                    except Exception as err:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: listen :: failed to update horizon.skip_metrics Redis hash - %s' % (err))
            if do_not_skip_metrics_list:
                update_metrics = {}
                for metric in list(set(do_not_skip_metrics_list)):
                    update_entry = False
                    if do_not_skip_metrics_dict:
                        try:
                            # Only update skip metrics every hour to save on
                            # Redis overhead
                            last_update_timestamp_str = do_not_skip_metrics_dict[metric]
                            if last_update_timestamp_str:
                                last_update_timestamp = int(float(last_update_timestamp_str))
                                if (now - last_update_timestamp) >= 3600:
                                    update_entry = True
                        except KeyError:
                            update_entry = True
                        except:
                            update_entry = True
                    else:
                        update_entry = True
                    if update_entry:
                        update_metrics[metric] = int(now)
                if update_metrics:
                    try:
                        self.redis_conn_decoded.hset(
                            'horizon.do_not_skip_metrics', mapping=update_metrics)
                    except Exception as err:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: listen :: failed to update horizon.do_not_skip_metrics Redis hash - %s' % (err))
            try:
                skip_metrics_dict = self.redis_conn_decoded.hgetall('horizon.skip_metrics')
                logger.info(
                    '%s :: got %s metrics from horizon.skip_metrics from Redis hash on minutely run' % (
                        skyline_app, str(len(skip_metrics_dict))))
            except Exception as err:
                logger.error(
                    '%s :: error on hgetall horizon.skip_metrics: %s' % (
                        skyline_app, str(err)))
            try:
                do_not_skip_metrics_dict = self.redis_conn_decoded.hgetall('horizon.do_not_skip_metrics')
                logger.info(
                    '%s :: got %s metrics from horizon.do_not_skip_metrics from Redis hash on minutely run' % (
                        skyline_app, str(len(do_not_skip_metrics_dict))))
            except Exception as err:
                logger.error(
                    '%s :: error on hgetall horizon.do_not_skip_metrics: %s' % (
                        skyline_app, str(err)))
            logger.info('%s :: checked %s metrics through in_skip_list' % (skyline_app, str(checked_for_skip_count)))
            skip_metrics_list = []
            do_not_skip_metrics_list = []
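# The shard membership test self.in_shard() used above is not shown in
# this section. A minimal sketch of one plausible implementation, an
# assumption for illustration rather than Skyline's actual code: hash the
# metric name deterministically and take it modulo the shard count, so
# every horizon instance computes the same shard for the same metric.
# Only the names HORIZON_SHARD and number_of_horizon_shards come from the
# surrounding code; the values are made up.
from zlib import crc32

HORIZON_SHARD = 0             # the shard assigned to this horizon instance
number_of_horizon_shards = 3  # assumed total number of shards


def in_shard(metric_name):
    # crc32 is stable across processes and hosts, unlike Python's
    # salted hash(), so all instances agree on the assignment
    metric_shard = crc32(metric_name.encode('utf-8')) % number_of_horizon_shards
    return metric_shard == HORIZON_SHARD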
def run(self):
    """
    Called when the process initializes.
    """

    # Log management to prevent overwriting
    # Allow the bin/<skyline_app>.d to manage the log
    # In Vista the log management is handled by the fetcher, the worker just
    # waits for the fetcher to do the log management
    now = int(time())
    log_wait_for = now + 5
    while now < log_wait_for:
        if os.path.isfile(skyline_app_loglock):
            sleep(.1)
            now = int(time())
        else:
            now = log_wait_for + 1

    logger.info('worker :: starting log management')
    if os.path.isfile(skyline_app_loglock):
        logger.error(
            'error :: worker :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
        try:
            os_remove(skyline_app_loglock)
            logger.info('worker :: log lock file removed')
        except OSError:
            logger.error(
                'error :: worker :: failed to remove %s, continuing' % skyline_app_loglock)
            pass
    else:
        logger.info('worker :: bin/%s.d log management done' % skyline_app)

    logger.info('worker :: starting worker')

    try:
        VISTA_ENABLED = settings.VISTA_ENABLED
        logger.info('worker :: VISTA_ENABLED is set to %s' % str(VISTA_ENABLED))
    except:
        VISTA_ENABLED = False
        logger.info('worker :: warning :: VISTA_ENABLED is not declared in settings.py, defaults to False')

    last_sent_to_graphite = int(time())
    metrics_sent_to_flux = 0

    # python-2.x and python3.x handle while 1 and while True differently
    # while 1:
    running = True
    while running:

        # Make sure Redis is up
        redis_up = False
        while not redis_up:
            try:
                redis_up = self.redis_conn.ping()
                if LOCAL_DEBUG:
                    logger.info('worker :: redis is up')
            except:
                logger.error('worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH))
                sleep(2)
                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # if settings.REDIS_PASSWORD:
                #     self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH)
                # else:
                #     self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                self.redis_conn = get_redis_conn(skyline_app)
                self.redis_conn_decoded = get_redis_conn_decoded(skyline_app)

        metrics_data = []
        redis_set = 'vista.fetcher.metrics.json'
        try:
            # Get a metric to validate from the Redis set
            # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
            #                      Branch #3262: py3
            # metrics_data = self.redis_conn.smembers(redis_set)
            metrics_data = self.redis_conn_decoded.smembers(redis_set)
            if LOCAL_DEBUG:
                logger.info('worker :: got redis set data - %s' % redis_set)
        except:
            logger.error(traceback.format_exc())
            logger.error('error :: worker :: retrieving Redis set %s data' % str(redis_set))

        if not metrics_data:
            if LOCAL_DEBUG:
                logger.info('worker :: no data from Redis set %s' % str(redis_set))
            sleep(5)

        for str_metric_data in metrics_data:
            delete_set_record = False
            remote_host_type = None
            try:
                # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings
                #                      Branch #3262: py3
                # Rather using get_redis_conn_decoded
                # if python_version == 3:
                #     str_metric_data = str_metric_data.decode('UTF-8')
                metric_data = literal_eval(str_metric_data)
                remote_host_type = str(metric_data[0]['remote_host_type'])
                if LOCAL_DEBUG:
                    logger.info(
                        'worker :: got data from Redis set for remote_host_type %s' % str(remote_host_type))
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to determine remote_host_type from %s' % str(str_metric_data))
                delete_set_record = True

            if not delete_set_record:
                try:
                    remote_target = str(metric_data[0]['remote_target'])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for target %s' % str(remote_target))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine target from %s' % str(str_metric_data))
                    delete_set_record = True

            metric = None
            if not delete_set_record:
                try:
                    metric = str(metric_data[0]['metric'])
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for metric %s' % str(metric))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine metric from %s' % str(str_metric_data))
                    delete_set_record = True

            namespace_prefix = ''
            if not delete_set_record:
                try:
                    namespace_prefix = str(metric_data[0]['namespace_prefix'])
                    namespace_prefix = '%s.' % namespace_prefix
                    if not namespace_prefix:
                        namespace_prefix = ''
                    if LOCAL_DEBUG:
                        logger.info(
                            'worker :: got data from Redis set for namespace_prefix %s' % str(namespace_prefix))
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine namespace_prefix from %s' % str(str_metric_data))
                    delete_set_record = True

            have_data = False
            if not delete_set_record:
                last_flux_metric_data = None
                cache_key = 'flux.last.%s' % (metric)
                try:
                    if python_version == 3:
                        redis_last_flux_metric_data = self.redis_conn.get(cache_key).decode('UTF-8')
                    else:
                        redis_last_flux_metric_data = self.redis_conn.get(cache_key)
                    last_flux_metric_data = literal_eval(redis_last_flux_metric_data)
                    if LOCAL_DEBUG:
                        logger.info('worker :: got last_flux_metric_data from Redis')
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: worker :: retrieving Redis key %s data' % str(cache_key))
                    last_flux_metric_data = False

                last_flux_timestamp = None
                if last_flux_metric_data:
                    try:
                        last_flux_timestamp = int(last_flux_metric_data[0])
                        if LOCAL_DEBUG:
                            logger.info('worker :: got last_flux_timestamp - %s' % str(last_flux_timestamp))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error('error :: worker :: failed determining last_flux_timestamp')
                        last_flux_timestamp = False

                # Determine the timestamp of the current minute to apply
                # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE
                time_now = int(time())
                # current_minute = datetime.datetime.utcfromtimestamp(time_now).strftime('%Y-%m-%d %H:%M')
                current_minute_hour = int(datetime.datetime.utcfromtimestamp(time_now).strftime('%H'))
                current_minute_minute = int(datetime.datetime.utcfromtimestamp(time_now).strftime('%M'))
                current_datetime = datetime.datetime.utcfromtimestamp(time_now).replace(
                    hour=current_minute_hour, minute=current_minute_minute,
                    second=0, microsecond=0)
                current_minute_timestamp_start = int(current_datetime.strftime('%s'))

                datapoint = None
                last_timestamp_with_data = None
                timeseries = []

                # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                metric_resolution = 60
                metric_resolution_determined = False

                try:
                    if python_version == 3:
                        datapoints_str = literal_eval(metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                    else:
                        # metric_datapoints = metric_data[0]['datapoints']
                        datapoints_str = literal_eval(metric_data[0]['datapoints'])
                        metric_datapoints = literal_eval(datapoints_str)
                    # for value, timestamp in metric_data[0]['datapoints']:
                    if LOCAL_DEBUG:
                        len_metric_datapoints = len(metric_datapoints)
                        logger.info(
                            'worker :: got %s metric_datapoints - %s' % (
                                str(len_metric_datapoints), str(metric_datapoints)))

                    # @added 20200107 - Task #3376: Enable vista and flux to deal with lower frequency data
                    # Determine resolution
                    resolution_timestamps = []
                    for metric_datapoint in metric_datapoints:
                        timestamp = int(metric_datapoint[0])
                        resolution_timestamps.append(timestamp)
                    timestamp_resolutions = []
                    if resolution_timestamps:
                        last_timestamp = None
                        for timestamp in resolution_timestamps:
                            if last_timestamp:
                                resolution = timestamp - last_timestamp
                                timestamp_resolutions.append(resolution)
                                last_timestamp = timestamp
                            else:
                                last_timestamp = timestamp
                    if timestamp_resolutions:
                        try:
                            timestamp_resolutions_count = Counter(timestamp_resolutions)
                            ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common()
                            metric_resolution = int(ordered_timestamp_resolutions_count[0][0])
                            if metric_resolution > 0:
                                metric_resolution_determined = True
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: worker :: failed to determine metric_resolution from %s' % (
                                    str(metric_data)))
                    if metric_resolution_determined:
                        cache_key = 'vista.last.resolution.%s' % metric
                        try:
                            # Update Redis key
                            self.redis_conn.setex(cache_key, 3600, metric_resolution)
                        except:
                            logger.error(traceback.format_exc())
                            logger.error(
                                'error :: fetcher :: failed to set Redis key - %s' % (cache_key))

                    for metric_datapoint in metric_datapoints:
                        # @20191010 - Branch #3140: vista
                        # fetcher passes through preformatted data points that
                        # are in the same format/order for both graphite and
                        # prometheus
                        # if remote_host_type == 'graphite':
                        #     value = float(metric_datapoint[0])
                        #     timestamp = int(metric_datapoint[1])
                        # if remote_host_type == 'prometheus':
                        #     value = float(metric_datapoint[1])
                        #     timestamp = int(metric_datapoint[0])
                        timestamp = int(metric_datapoint[0])
                        value = float(metric_datapoint[1])
                        append_to_timeseries = False
                        if last_flux_timestamp:
                            if int(timestamp) > last_flux_timestamp:
                                # timeseries.append([timestamp, value])
                                append_to_timeseries = True
                        else:
                            # timeseries.append([timestamp, value])
                            append_to_timeseries = True

                        # Here if the timestamp of the data point falls
                        # within the current minute, it is discarded and not
                        # sent to flux, to ensure that high frequency metrics
                        # can have their minutely bins fully populated before
                        # they are submitted to Graphite
                        if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE:
                            if int(timestamp) >= current_minute_timestamp_start:
                                append_to_timeseries = False
                        if append_to_timeseries:
                            timeseries.append([timestamp, value])

                    last_timestamp_with_data = 0
                    for timestamp, value in timeseries[::-1]:
                        has_value = False
                        if value == 0.0:
                            has_value = True
                        if value:
                            has_value = True
                        if has_value:
                            last_timestamp_with_data = int(timestamp)
                            datapoint = value
                            break
                    if last_timestamp_with_data:
                        have_data = True
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to determine datapoints from %s' % (str(metric_data)))
                    delete_set_record = True
                if not timeseries:
                    logger.info(
                        'worker :: after processing, there were no valid data points in %s' % (str(metric_data)))
                    delete_set_record = True
                if not have_data and timeseries:
                    logger.error(
                        'error :: worker :: failed to determine last_timestamp_with_data from %s' % (str(metric_data)))
                    delete_set_record = True

            if delete_set_record:
                try:
                    redis_set = 'vista.fetcher.metrics.json'
                    self.redis_conn.srem(redis_set, str_metric_data)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                            str(redis_set), str(str_metric_data)))
                continue

            if not metric:
                continue

            valid_data = True
            if last_flux_timestamp and last_timestamp_with_data:
                if int(last_timestamp_with_data) <= last_flux_timestamp:
                    valid_data = False
            if not valid_data:
                redis_set = 'vista.fetcher.metrics.json'
                logger.info(
                    'worker :: no valid data in fetched data removing from Redis set %s - data - %s' % (
                        redis_set, str(str_metric_data)))
                try:
                    self.redis_conn.srem(redis_set, str_metric_data)
                except:
                    logger.error(traceback.format_exc())
                    logger.error(
                        'error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                            redis_set, str(str_metric_data)))
                continue

            if valid_data:
                flux_host = 'http://%s:%s' % (settings.FLUX_IP, settings.FLUX_PORT)

                # Resample
                resample_at = None
                if resample_at == 'none' or resample_at == '0Min':
                    resample_at = False
                if resample_at == 'None' or resample_at == '0min':
                    resample_at = False
                if resample_at is None or resample_at == '0' or resample_at == 0:
                    resample_at = False
                if resample_at:
                    try:
                        df = pd.DataFrame(timeseries)
                        df.columns = ['timestamp', 'value']
                        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
                        df = df.set_index('timestamp')
                        resampled_df = df.resample(resample_at).sum()
                        resampled_timeseries = []
                        for index, row in resampled_df.iterrows():
                            timestamp = int(index.strftime('%s'))
                            resampled_timeseries.append([timestamp, row[0]])
                        timeseries = resampled_timeseries
                        timeseries_length = len(timeseries)
                        logger.info(
                            'worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (
                                str(resample_at), str(timeseries_length)))
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to resample time series at %s for %s with time series %s' % (
                                str(resample_at), str(metric), str(timeseries)))

                for timestamp, value in timeseries:
                    flux_url = '%s/metric_data?metric=%s&value=%s&timestamp=%s&key=%s' % (
                        flux_host, metric, str(value), str(timestamp),
                        settings.FLUX_SELF_API_KEY)
                    success = False
                    try:
                        response = requests.get(flux_url)
                        if response.status_code == 200:
                            success = True
                        elif response.status_code == 204:
                            success = True
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to request %s' % str(flux_url))
                    if not success:
                        logger.error(
                            'error :: worker :: http status code - %s, reason - %s' % (
                                str(response.status_code), str(response.reason)))

                if success:
                    metrics_sent_to_flux += 1
                    redis_set = 'vista.fetcher.metrics.json'
                    # @added 20191011 - Task #3258: Reduce vista logging
                    timeseries_length = len(timeseries)
                    # @modified 20191011 - Task #3258: Reduce vista logging
                    # logger.info('worker :: data submitted to flux OK, removing data from Redis set %s' % (
                    #     redis_set))
                    logger.info(
                        'worker :: %s data points submitted to flux OK for %s' % (
                            str(timeseries_length), metric))
                    try:
                        self.redis_conn.srem(redis_set, str_metric_data)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to delete data from Redis set %s, data - %s' % (
                                redis_set, str(str_metric_data)))
                    redis_set = 'vista.fetcher.unique_metrics'
                    try:
                        self.redis_conn.sadd(redis_set, remote_target)
                    except:
                        logger.error(traceback.format_exc())
                        logger.error(
                            'error :: worker :: failed to add %s to Redis set %s' % (
                                remote_target, redis_set))

        time_now = int(time())
        if (time_now - last_sent_to_graphite) >= 60:
            logger.info(
                'worker :: metrics sent_to_flux in last 60 seconds - %s' % str(metrics_sent_to_flux))
            send_metric_name = '%s.metrics_sent_to_flux' % skyline_app_graphite_namespace
            try:
                send_graphite_metric(parent_skyline_app, send_metric_name, str(metrics_sent_to_flux))
                last_sent_to_graphite = int(time())
                metrics_sent_to_flux = 0
            except:
                logger.error(traceback.format_exc())
                logger.error(
                    'error :: worker :: failed to send_graphite_metric %s with %s' % (
                        send_metric_name, str(metrics_sent_to_flux)))
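# The resample branch above is effectively disabled because resample_at is
# hard-coded to None. A small standalone sketch of the same pandas
# technique with an assumed '1Min' rule and made-up data, just to
# illustrate what that branch would do if it were enabled:
import pandas as pd

timeseries = [
    [1556817000, 1.0], [1556817015, 2.0], [1556817030, 3.0],
    [1556817060, 4.0], [1556817075, 5.0],
]
df = pd.DataFrame(timeseries, columns=['timestamp', 'value'])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
df = df.set_index('timestamp')

# Sum the sub-minute datapoints into minutely bins
resampled_df = df.resample('1Min').sum()
resampled = [
    [int(index.timestamp()), row['value']]
    for index, row in resampled_df.iterrows()
]
print(resampled)  # [[1556817000, 6.0], [1556817060, 9.0]]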
def run(self): """ Called when the process intializes. """ def pickle_data_to_graphite(data): message = None try: payload = pickle.dumps(data, protocol=2) header = struct.pack("!L", len(payload)) message = header + payload except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to pickle to send to Graphite' ) return False if message: try: sock = socket.socket() sock.connect((CARBON_HOST, FLUX_CARBON_PICKLE_PORT)) sock.sendall(message) sock.close() except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send pickle data to Graphite' ) return False else: logger.error( 'error :: populate_metric_worker :: failed to pickle metric data into message' ) return False return True logger.info('populate_metric_worker :: starting worker') # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'populate_metric_worker :: cannot connect to Redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191111 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) # @added 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) metricDict = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. 
# metric_json = [metricName, metricValue, metricTimestamp] metricDict = self.q.get(True, 1) logger.info('populate_metric_worker :: processing queue item') except Empty: logger.info( 'populate_metric_worker :: queue is empty and timed out, sleeping for 30 seconds' ) sleep(30) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'populate_metric_worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info( 'populate_metric_worker :: server was interrupted - SystemExit' ) except Exception as e: logger.error('error :: populate_metric_worker :: %s' % (str(e))) if not metricDict: continue try: remote_host_type = str(metricDict['remote_host_type']) remote_target = str(metricDict['remote_target']) metric = str(metricDict['metric']) namespace_prefix = str(metricDict['namespace_prefix']) if not namespace_prefix: namespace_prefix = '' if namespace_prefix == 'None': namespace_prefix = '' key = str(metricDict['key']) token = str(metricDict['token']) user = str(metricDict['user']) password = str(metricDict['password']) if metricDict['fetch_resolution_urls'] == 'None': logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue if metricDict['fetch_resolution_urls'] == '()' or metricDict[ 'fetch_resolution_urls'] == (): logger.info( 'No fetch_resolution_urls declared for %s, nothing to do' % remote_target) continue fetch_resolution_urls_str = literal_eval( metricDict['fetch_resolution_urls']) fetch_resolution_urls = literal_eval(fetch_resolution_urls_str) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) if LOCAL_DEBUG: try: logger.info( 'populate_metric_worker :: remote_target from metricData set to %s' % remote_target) logger.info( 'populate_metric_worker :: metric from metricData set to %s' % metric) logger.info( 'populate_metric_worker :: namespace_prefix from metricData set to %s' % namespace_prefix) logger.info( 'populate_metric_worker :: key from metricData set to %s' % key) logger.info( 'populate_metric_worker :: token from metricData set to %s' % token) logger.info( 'populate_metric_worker :: user from metricData set to %s' % user) logger.info( 'populate_metric_worker :: password from metricData set to %s' % password) logger.info( 'populate_metric_worker :: fetch_resolution_urls from metricData set to %s' % str(fetch_resolution_urls)) if fetch_resolution_urls: for fetch_url in fetch_resolution_urls: logger.info( 'populate_metric_worker :: a fetch_url from metricData is set to %s' % str(fetch_url)) logger.info( 'populate_metric_worker :: metric is set to %s' % metric) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to read from metricData' ) # Best effort to de-duplicate the data sent to Graphite cache_key = 'flux.last.%s' % metric last_flux_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key).decode('utf-8') redis_last_metric_data = self.redis_conn_decoded.get(cache_key) last_metric_data = literal_eval(redis_last_metric_data) last_flux_timestamp = int(last_metric_data[0]) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine last_flux_timestamp from Redis key %s' % cache_key) last_flux_timestamp = False recent_last_flux_timestamp_present = False if last_flux_timestamp: now = 
int(time()) if (now - last_flux_timestamp) < 600: recent_last_flux_timestamp_present = True # Skyline has the metric so adding it to the vista.fetcher # Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: the last flux update for %s was less than 600 seconds ago, added metric to %s' % (metric, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) # continue if not last_flux_timestamp: # Check Graphite does not have the data or determine what the # last data Graphite has is logger.info( 'populate_metric_worker :: no last_flux_timestamp was found in Redis for %s, checking if Graphite has data' % (metric)) check_graphite_from = [ '-50mins', '-6hours', '-24hours', '-7days', '-30days', '-90days' ] timeseries = [] for graphite_from in check_graphite_from: if last_flux_timestamp: break logger.info( 'populate_metric_worker :: checking %s in Graphite from %s' % (metric, graphite_from)) got_data = False try: # We use absolute time so that if there is a lag in mirage the correct # timeseries data is still surfaced relevant to the anomalous datapoint # timestamp if settings.GRAPHITE_PORT != '': url = '%s://%s:%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, str(settings.GRAPHITE_PORT), settings.GRAPHITE_RENDER_URI, graphite_from, metric) else: url = '%s://%s/%s/?from=%s&target=%s&format=json' % ( settings.GRAPHITE_PROTOCOL, settings.GRAPHITE_HOST, settings.GRAPHITE_RENDER_URI, graphite_from, metric) logger.info( 'populate_metric_worker :: using Graphite URL - %s' % (url)) r = requests.get(url) if r.status_code == 200: js = [] try: js = r.json() except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue if not js: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue got_data = True logger.info( 'populate_metric_worker :: %s present in Graphite from %s' % (metric, graphite_from)) else: logger.info( 'populate_metric_worker :: %s not present in Graphite from %s' % (metric, graphite_from)) continue except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) continue datapoints = [] if got_data: try: js = r.json() datapoints = js[0]['datapoints'] logger.info( 'populate_metric_worker :: %s data points are present in the Graphite %s data' % (str(len(datapoints)), str(graphite_from))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from Graphite' ) for datapoint in datapoints: try: value = float(datapoint[0]) timestamp = int(datapoint[1]) new_datapoint = [timestamp, value] timeseries.append(new_datapoint) except: # nosec continue last_timestamp_with_data = None for timestamp, value in timeseries[::-1]: has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if has_value: last_timestamp_with_data = int(timestamp) datapoint = value break if last_timestamp_with_data: # Here we set this as the missing last_flux_timestamp last_flux_timestamp = last_timestamp_with_data recent_last_flux_timestamp_present = True logger.info( 'populate_metric_worker :: %s last timestamp in Graphite from %s is %s, using as last_flux_timestamp' % 
(metric, str(graphite_from), str(last_flux_timestamp))) timeseries = [] start_populating = int(time()) datapoints_added_to_timeseries = 0 datapoints_already_populated = 0 datapoints_with_no_value = 0 timestamp = None value = None # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key if the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista raw_timeseries = [] for fetch_url in fetch_resolution_urls: # if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': # This was for the query query and resample method and not for # the query_range query if recent_last_flux_timestamp_present and remote_host_type == 'prometheus_query_range_NOT_FOR_GE_11000': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) if recent_last_flux_timestamp_present and remote_host_type == 'prometheus': try: logger.info( 'populate_metric_worker :: recent data so replacing fetch_url %s ' % (fetch_url)) seconds_to_fetch = int(time()) - last_flux_timestamp minutes_to_fetch = int(seconds_to_fetch / 60) + 2 re_mins_to_fetch = '[%sm]' % str(minutes_to_fetch) fetch_url = re.sub(r'\[.*\]', re_mins_to_fetch, fetch_url) encoded_re_mins_to_fetch = '%%5B%sm%%5D' % str( minutes_to_fetch) fetch_url = re.sub(r'%5B.*%5D', encoded_re_mins_to_fetch, fetch_url) logger.info( 'populate_metric_worker :: replaced fetch_url %s ' % (fetch_url)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to rewrite URL' ) success = False response = None try: logger.info( 'populate_metric_worker :: getting data from %s' % str(fetch_url)) response = requests.get(fetch_url) if response.status_code == 200: success = True except: logger.info(traceback.format_exc()) # Only reference the response if requests.get returned one, # otherwise logging response.status_code here would raise a # NameError inside the except handler if response is not None: logger.error( 'error :: populate_metric_worker :: http status code - %s, reason - %s' % (str(response.status_code), str(response.reason))) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) if not success: continue datapoints = None try: js = response.json() if remote_host_type == 'graphite': datapoints = js[0]['datapoints'] if remote_host_type == 'prometheus': datapoints = js['data']['result'][0]['values'] datapoints_fetched = len(datapoints) logger.info( 'populate_metric_worker :: retrieved %s data points from %s' % (str(datapoints_fetched), str(fetch_url))) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to get data from %s' % str(fetch_url)) # Example # datapoints[0] # [7.3, 1556817000] # Add each data point and timestamp to the timeseries list so # they can be sent to Graphite if not datapoints: logger.info( 'populate_metric_worker :: failed to get any data from %s' % str(fetch_url)) continue # @added 20191108 - Bug #3312: flux - populate_metric_worker - handle None in datapoints valid_datapoints = [] for datapoint
in datapoints: value = None timestamp = None if remote_host_type == 'graphite': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[1], datapoint[0]]) try: raw_value = datapoint[0] if raw_value is None: datapoints_with_no_value += 1 continue value = float(datapoint[0]) timestamp = int(datapoint[1]) valid_datapoints.append([value, timestamp]) except: continue if remote_host_type == 'prometheus': # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints raw_timeseries.append([datapoint[0], datapoint[1]]) try: raw_value = datapoint[1] if raw_value is None: datapoints_with_no_value += 1 continue timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue valid_datapoints.append([timestamp, value]) datapoints = valid_datapoints # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data datapoints.sort() # Determine the timestamp of the current minute to apply # VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE time_now = int(time()) current_minute_hour = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%H')) current_minute_minute = int( datetime.datetime.utcfromtimestamp(time_now).strftime( '%M')) current_datetime = datetime.datetime.utcfromtimestamp( time_now).replace(hour=current_minute_hour, minute=current_minute_minute, second=0, microsecond=0) current_minute_timestamp_start = int( current_datetime.strftime('%s')) datapoints_in_current_minute = 0 last_error = None value = None timestamp = None for datapoint in datapoints: try: if remote_host_type == 'graphite': try: raw_value = datapoint[0] if raw_value is None: continue value = float(datapoint[0]) timestamp = int(datapoint[1]) except: continue if remote_host_type == 'prometheus': # timestamp = int(datapoint[0]) try: timestamp = int(datapoint[0]) value = float(datapoint[1]) except: continue submit_data = True if last_flux_timestamp: if timestamp <= last_flux_timestamp: submit_data = False datapoints_already_populated += 1 # Here if the timestamp of the data point falls # within the current minute, it is discarded and not # sent to flux, to ensure that high frequency metrics # can have their minutely bins fully populated before # they are submitted to Graphite if settings.VISTA_DO_NOT_SUBMIT_CURRENT_MINUTE: if timestamp >= current_minute_timestamp_start: submit_data = False datapoints_in_current_minute += 1 if submit_data: new_datapoint = [timestamp, value] timeseries.append(new_datapoint) datapoints_added_to_timeseries += 1 # nosec to exclude from bandit tests except: # nosec last_error = traceback.format_exc() datapoints_with_no_value += 1 continue if last_error: logger.error(last_error) logger.error( 'error :: populate_metric_worker :: the above is the last_error encountered processing %s' % (str(metric))) if datapoints_with_no_value: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they had value None' % (str(datapoints_with_no_value))) if datapoints_in_current_minute: logger.info( 'populate_metric_worker :: %s of the fetched records were discarded as they fall within the current minute' % (str(datapoints_in_current_minute))) logger.info( 'populate_metric_worker :: %s of the fetched data points are older than the last known flux timestamp' % (str(datapoints_already_populated))) logger.info( 'populate_metric_worker :: added %s data points to the time series to submit to Graphite' % (str(datapoints_added_to_timeseries))) 
end_fetching = int(time()) seconds_to_fetch = end_fetching - start_populating if timestamp: logger.info( 'populate_metric_worker :: last fetched value - %s, timestamp %s' % (str(value), str(timestamp))) logger.info( 'populate_metric_worker :: %s data points fetched for %s in %s seconds' % (str(datapoints_added_to_timeseries), remote_target, str(seconds_to_fetch))) # @added 20191111 - Bug #3312: flux - populate_metric_worker - handle None in datapoints # And set flux.last key if the returned value from the remote is # null so that time series that are mostly null do not keep on # getting added to flux populate_metric by Vista if not timeseries: set_flux_key = False try: sorted_raw_timeseries = sorted(raw_timeseries, key=lambda x: x[0]) last_ts = sorted_raw_timeseries[-1][0] if int(last_ts) > (end_fetching - 120): if sorted_raw_timeseries[-1][1] is None: set_flux_key = True except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to determine if last value was null' ) if set_flux_key: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(last_ts), None] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: even though there are no data points, set the metric Redis key so as not to loop round on this metric - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: even though no data points, added %s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: even though no data points, failed to add %s to Redis set %s' % (str(data), str(redis_set))) if not timeseries: logger.info( 'populate_metric_worker :: no data in the timeseries list for the time series for %s' % metric) continue # Order the time series by timestamp as the tuple can shift # order resulting in more recent data being added before older # data timeseries.sort() timeseries_length = len(timeseries) # Resample resample_at = '1Min' if resample_at: try: df = pd.DataFrame(timeseries) df.columns = ['timestamp', 'value'] df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix') df = df.set_index('timestamp') # resampled_df = df.resample(resample_at).sum() # Use the mean as Prometheus uses the average in the # query_range API method resampled_df = df.resample(resample_at).mean() resampled_timeseries = [] for index, row in resampled_df.iterrows(): timestamp = int(index.strftime('%s')) resampled_timeseries.append([timestamp, row[0]]) timeseries = resampled_timeseries timeseries_length = len(timeseries) logger.info( 'populate_metric_worker :: time series resampled at %s resulting in %s data points to send to Graphite' % (str(resample_at), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to resample time series for %s' % str(metric)) logger.info( 'populate_metric_worker :: %s data points to send to Graphite' % (str(timeseries_length))) timestamp = None value = None sent_to_graphite = 0 # use_pickle = False use_pickle = True if not use_pickle: for timestamp, value in timeseries: try:
graphyte.send(metric, float(value), int(timestamp)) sent_to_graphite += 1 if sent_to_graphite % 1000 == 0: logger.info( 'populate_metric_worker :: submitted %s of %s data points to Graphite so far' % (str(sent_to_graphite), str(timeseries_length))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(metric)) else: listOfMetricTuples = [] try: for timestamp, value in timeseries: tuple_data = (metric, (int(timestamp), float(value))) listOfMetricTuples.append(tuple_data) sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to populate listOfMetricTuples for %s' % str(metric)) if listOfMetricTuples: data_points_sent = 0 smallListOfMetricTuples = [] tuples_added = 0 for data in listOfMetricTuples: smallListOfMetricTuples.append(data) tuples_added += 1 if tuples_added >= 1000: pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_added logger.info( 'populate_metric_worker :: sent %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_added), str(data_points_sent), str(timeseries_length), metric)) sent_to_graphite += len( smallListOfMetricTuples) smallListOfMetricTuples = [] tuples_added = 0 else: logger.error( 'error :: populate_metric_worker :: failed to send %s data points to Graphite via pickle for %s' % (str(tuples_added), metric)) if smallListOfMetricTuples: tuples_to_send = len(smallListOfMetricTuples) pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_to_send logger.info( 'populate_metric_worker :: sent the last %s/%s of %s data points to Graphite via pickle for %s' % (str(tuples_to_send), str(data_points_sent), str(timeseries_length), metric)) else: logger.error( 'error :: populate_metric_worker :: failed to send the last %s data points to Graphite via pickle for %s' % (str(tuples_to_send), metric)) logger.info( 'populate_metric_worker :: sent %s data points to Graphite for %s' % (str(sent_to_graphite), metric)) try: skyline_metric = '%s.datapoints_sent_to_graphite' % ( skyline_app_graphite_namespace) # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, float(sent_to_graphite), int(time())) send_graphite_metric(skyline_app, skyline_metric, float(sent_to_graphite)) logger.info( 'populate_metric_worker :: submitted %s to Graphite for %s' % (str(float(sent_to_graphite)), skyline_metric)) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to send metric data to Graphite for %s' % str(skyline_metric)) has_value = False if value == 0.0: has_value = True if value == 0: has_value = True if value: has_value = True if timestamp and has_value: try: # Update Redis flux key cache_key = 'flux.last.%s' % metric metric_data = [int(timestamp), float(value)] self.redis_conn.set(cache_key, str(metric_data)) logger.info( 'populate_metric_worker :: set the metric Redis key - %s - %s' % (cache_key, str(metric_data))) except: logger.error(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to set Redis key - %s - %s' % (cache_key, str(metric_data))) # Adding to the vista.fetcher.unique_metrics Redis set redis_set = 'vista.fetcher.unique_metrics' data = str(remote_target) try: self.redis_conn.sadd(redis_set, data) logger.info( 'populate_metric_worker :: added 
%s to Redis set %s' % (remote_target, redis_set)) except: logger.info(traceback.format_exc()) logger.error( 'error :: populate_metric_worker :: failed to add %s to Redis set %s' % (str(data), str(redis_set))) end_populating = int(time()) seconds_to_run = end_populating - start_populating logger.info( 'populate_metric_worker :: %s populated to Graphite in %s seconds' % (metric, str(seconds_to_run)))
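# NOTE: the following is a minimal, illustrative sketch of the 1Min
# resampling step populate_metric_worker applies above - the
# [timestamp, value] time series is loaded into a pandas DataFrame,
# indexed on the timestamp and resampled with mean() to mirror the
# averaging the Prometheus query_range API applies. It is a sketch
# only, not a function this module defines or calls.
def example_resample_timeseries(timeseries, resample_at='1Min'):
    import pandas as pd
    df = pd.DataFrame(timeseries, columns=['timestamp', 'value'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s', origin='unix')
    df = df.set_index('timestamp')
    # mean() rather than sum() to match the Prometheus averaging behaviour
    resampled_df = df.resample(resample_at).mean()
    # convert back to the [[timestamp, value], ...] list form used above
    return [[int(index.timestamp()), row['value']]
            for index, row in resampled_df.iterrows()]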
def run(self): """ Called when the process initializes. """ logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) if LOCAL_DEBUG: try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: debug :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. # metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: logger.info('worker :: queue is empty and timed out') sleep(1) except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = False if metric_data: try: metric = str(metric_data[0]) value = float(metric_data[1]) timestamp = int(metric_data[2]) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = int(metric_data[3]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: queue item found - %s' % str(metric_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data)) continue if settings.FLUX_SEND_TO_CARBON: # Best effort de-duplicate the data valid_data = True # @added 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately redis_last_metric_data = None # @modified 20200206 - Feature #3444: Allow flux to backfill # Only check flux.last key if this is not backfill if not backfill: cache_key = 'flux.last.%s' % metric last_metric_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key) redis_last_metric_data = self.redis_conn_decoded.get( cache_key) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) redis_last_metric_data = None # @modified 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately, only # literal_eval if Redis had data for the key if redis_last_metric_data: try: last_metric_data = literal_eval(
redis_last_metric_data) last_metric_timestamp = int( last_metric_data[0]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: last_metric_timestamp for %s from %s is %s' % (metric, str(cache_key), str(last_metric_timestamp))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) last_metric_timestamp = False if last_metric_timestamp: if timestamp <= last_metric_timestamp: valid_data = False if LOCAL_DEBUG: logger.info( 'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s' % (str(timestamp), str(last_metric_timestamp), metric)) if valid_data: submittedToGraphite = False try: graphyte.send(metric, value, timestamp) submittedToGraphite = True logger.info( 'worker :: sent %s, %s, %s to Graphite' % (str(metric), str(value), str(timestamp))) metrics_sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send metric data to Graphite for %s' % str(metric)) metric = None if submittedToGraphite: # Update the metric Redis flux key # @modified 20200206 - Feature #3444: Allow flux to backfill # Only update the flux.last key if this is not backfill if not backfill: metric_data = [timestamp, value] self.redis_conn.set(cache_key, str(metric_data)) # @added 20200213 - Bug #3448: Repeated airgapped_metrics else: # @added 20200213 - Bug #3448: Repeated airgapped_metrics # Add a flux.filled key to Redis with an expiry # set to FULL_DURATION so that Analyzer knows to # sort and deduplicate the Redis time series # data as carbon-relay will send it to Horizon # and the datapoints will be out of order in the # Redis key try: flux_filled_key = 'flux.filled.%s' % str( metric) self.redis_conn.setex( flux_filled_key, settings.FULL_DURATION, int(time())) logger.info('worker :: set Redis key %s' % (str(flux_filled_key))) except Exception as e: logger.error( 'error :: could not set Redis flux.filled key: %s' % e) else: logger.info( 'worker :: discarded %s, %s, %s as a data point for timestamp %s has already been submitted to Graphite' % (str(metric), str(value), str(timestamp), str(timestamp))) else: logger.info( 'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s' % (str(settings.FLUX_SEND_TO_CARBON), str(metric), str(value), str(timestamp))) if settings.FLUX_SEND_TO_STATSD: statsd_conn.incr(metric, value, timestamp) logger.info('worker :: sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp))) time_now = int(time()) if (time_now - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite)))
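# NOTE: a minimal, illustrative sketch of the queue item format the worker
# above consumes. Each item put on the flux queue is a list of
# [metricName, metricValue, metricTimestamp] with a backfill flag appended
# (Feature #3444). The default values below are assumptions for
# illustration only, not data this module produces.
def example_parse_metric_data(metric_data=None):
    if metric_data is None:
        metric_data = ['test.metric', 7.3, 1556817000, 0]
    metric = str(metric_data[0])
    value = float(metric_data[1])
    timestamp = int(metric_data[2])
    # 0 - normal submission, de-duplicated against the flux.last Redis key
    # 1 - backfill, submitted without checking or updating flux.last
    backfill = int(metric_data[3])
    return metric, value, timestamp, backfill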
def listen_pickle(self): """ Listen for pickles over tcp """ while 1: try: # Set up the TCP listening socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) s.bind((self.ip, self.port)) s.setblocking(1) s.listen(5) logger.info('%s :: listening over tcp for pickles on %s' % (skyline_app, str(self.port))) (conn, address) = s.accept() logger.info('%s :: connection from %s on %s' % (skyline_app, str(address[0]), str(self.port))) chunk = [] while 1: self.check_if_parent_is_alive() try: # @modified 20191016 - Task #3278: py3 handle bytes and not str in pickles # Branch #3262: py3 # Added ability to log and debug if LOCAL_DEBUG: length = None body = None try: length = Struct('!I').unpack( self.read_all(conn, 4)) except: logger.error(traceback.format_exc()) logger.error( 'error :: listen :: could not determine length' ) if length: try: body = self.read_all(conn, length[0]) except: logger.error(traceback.format_exc()) logger.error( 'error :: listen :: could not read body' ) else: length = Struct('!I').unpack(self.read_all( conn, 4)) body = self.read_all(conn, length[0]) # Iterate and chunk each individual datapoint for bunch in self.gen_unpickle(body): for metric in bunch: chunk.append(metric) # Queue the chunk and empty the variable if len(chunk) > settings.CHUNK_SIZE: try: self.q.put(list(chunk), block=False) chunk[:] = [] # Drop chunk if queue is full except Full: chunks_dropped = str(len(chunk)) logger.info( '%s :: pickle queue is full, dropping %s datapoints' % (skyline_app, chunks_dropped)) # self.send_graphite_metric( # 'skyline.horizon.' + SERVER_METRIC_PATH + 'pickle_chunks_dropped', # chunks_dropped) send_metric_name = '%s.pickle_chunks_dropped' % skyline_app_graphite_namespace send_graphite_metric( skyline_app, send_metric_name, chunks_dropped) chunk[:] = [] except Exception as e: logger.info(e) logger.info( '%s :: incoming pickle connection dropped, attempting to reconnect' % skyline_app) break except Exception as e: logger.info('%s :: cannot connect to socket: %s' % (skyline_app, str(e))) break
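# NOTE: a minimal, illustrative sketch of the wire format listen_pickle
# expects - a 4 byte big-endian unsigned length header (matching the
# Struct('!I') read above) followed by a pickled list of
# (metric, (timestamp, value)) tuples, as carbon-relay sends them. The
# host, port and metric below are assumptions for illustration only.
def example_send_horizon_pickle(host='127.0.0.1', port=2024):
    import pickle
    import socket
    import struct
    datapoints = [('test.metric', (1556817000, 7.3))]
    payload = pickle.dumps(datapoints, protocol=2)
    # 4 byte big-endian unsigned length header, then the pickle body
    header = struct.pack('!I', len(payload))
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((host, port))
    sock.sendall(header + payload)
    sock.close()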
def run(self): """ Called when the process initializes. """ logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error('worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) if settings.REDIS_PASSWORD: self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) else: self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g. # metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: logger.info('worker :: queue is empty and timed out') sleep(1) except NotImplementedError: pass except KeyboardInterrupt: logger.info('worker :: server has been issued a user signal to terminate - KeyboardInterrupt') except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) if metric_data: try: metric = str(metric_data[0]) value = float(metric_data[1]) timestamp = int(metric_data[2]) except: logger.error(traceback.format_exc()) logger.error('error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data)) continue if settings.FLUX_SEND_TO_CARBON: # Best effort de-duplicate the data valid_data = True cache_key = 'flux.last.%s' % metric last_metric_timestamp = None try: redis_last_metric_data = self.redis_conn.get(cache_key) last_metric_data = literal_eval(redis_last_metric_data) last_metric_timestamp = int(last_metric_data[0]) except: last_metric_timestamp = False if last_metric_timestamp: if timestamp <= last_metric_timestamp: valid_data = False if valid_data: submittedToGraphite = False try: graphyte.send(metric, value, timestamp) submittedToGraphite = True logger.info('worker :: sent %s, %s, %s to Graphite' % (metric, str(value), str(timestamp))) metrics_sent_to_graphite += 1 except: logger.error(traceback.format_exc()) logger.error('error :: worker :: failed to send metric data to Graphite for %s' % str(metric)) metric = None if submittedToGraphite: # Update the metric Redis flux key metric_data = [timestamp, value] self.redis_conn.set(cache_key, str(metric_data)) else: logger.info('worker :: discarded %s, %s, %s as a data point for timestamp %s has already been submitted to Graphite' % ( metric, str(value), str(timestamp), str(timestamp))) if settings.FLUX_SEND_TO_STATSD: statsd_conn.incr(metric, value, timestamp) logger.info('worker :: sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp))) time_now = int(time()) if (time_now - last_sent_to_graphite) >= 60: logger.info('worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc())
logger.error('error :: worker :: failed to send_graphite_metric %s with %s' % ( skyline_metric, str(metrics_sent_to_graphite)))
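# NOTE: a minimal, illustrative sketch of the graphyte usage the worker
# above relies on for the plaintext Graphite line protocol - the module
# is initialised once with the Carbon host and graphyte.send() then
# writes one datapoint per call. The host, port and metric below are
# assumptions for illustration only.
def example_graphyte_send():
    import graphyte
    # initialise the module-level sender once, then send datapoints
    graphyte.init('127.0.0.1', port=2003)
    graphyte.send('test.metric', 7.3, 1556817000)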
def run(self): """ Called when the process initializes. """ def pickle_data_to_graphite(data): message = None try: payload = pickle.dumps(data, protocol=2) header = struct.pack("!L", len(payload)) message = header + payload except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to pickle data to send to Graphite') return False if message: try: sock = socket.socket() sock.connect( (CARBON_HOST, settings.FLUX_CARBON_PICKLE_PORT)) sock.sendall(message) sock.close() except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send pickle data to Graphite' ) return False else: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to pickle metric data into message' ) return False return True def submit_pickle_data_to_graphite(pickle_data): # @modified 20201207 - Task #3864: flux - try except everything try: number_of_datapoints = len(pickle_data) except Exception as e: logger.error( 'error :: worker :: could not determine number_of_datapoints from len(pickle_data) - %s' % str(e)) return False data_points_sent = 0 smallListOfMetricTuples = [] tuples_added = 0 for data in pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: smallListOfMetricTuples.append(data) tuples_added += 1 if tuples_added >= 480: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) except Exception as e: logger.error( 'error :: worker :: pickle_data_to_graphite error - %s' % str(e)) pickle_data_sent = False # Reduce the speed of submissions to Graphite # if there are lots of data points if number_of_datapoints > 4000: sleep(0.3) if pickle_data_sent: data_points_sent += tuples_added logger.info( 'worker :: sent %s/%s of %s data points to Graphite via pickle' % (str(tuples_added), str(data_points_sent), str(number_of_datapoints))) smallListOfMetricTuples = [] tuples_added = 0 else: logger.error( 'error :: worker :: failed to send %s data points to Graphite via pickle' % (str(tuples_added))) return False except Exception as e: logger.error( 'error :: worker :: error handling data in pickle_data - %s' % str(e)) return False if smallListOfMetricTuples: # @modified 20201207 - Task #3864: flux - try except everything try: tuples_to_send = len(smallListOfMetricTuples) pickle_data_sent = pickle_data_to_graphite( smallListOfMetricTuples) if pickle_data_sent: data_points_sent += tuples_to_send logger.info( 'worker :: sent the last %s/%s of %s data points to Graphite via pickle' % (str(tuples_to_send), str(data_points_sent), str(number_of_datapoints))) else: logger.error( 'error :: failed to send the last %s data points to Graphite via pickle' % (str(tuples_to_send))) return False except Exception as e: logger.error( 'error :: worker :: error in smallListOfMetricTuples pickle_data_to_graphite - %s' % str(e)) return False return True logger.info('worker :: starting worker') last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES last_zero_fill_to_graphite = 0 metrics_sent = [] remove_from_flux_queue_redis_set = [] # @added 20201019 - Feature #3790: flux - pickle to Graphite pickle_data = [] # send_to_receiver = 'line' send_to_receiver = 'pickle' # @modified 20201207 - Task #3864: flux - try except everything try: metric_data_queue_size = self.q.qsize() except Exception as e: logger.error( 'error :: worker :: could not determine metric_data_queue_size - %s' % str(e))
metric_data_queue_size = 0 if metric_data_queue_size > 10: send_to_receiver = 'pickle' # @added 20201120 - Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent = [] # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Even if flux.last Redis keys are disabled in flux they are used in # Vista vista_metrics = [] if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED: try: vista_metrics = list( self.redis_conn_decoded.sscan_iter('vista.metrics', match='*')) except: vista_metrics = [] # Populate API keys and tokens in memcache # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up redis_up = False while not redis_up: try: redis_up = self.redis_conn.ping() except: logger.error( 'worker :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(2) # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # @modified 20201207 - Task #3864: flux - try except everything try: self.redis_conn = get_redis_conn(skyline_app) except Exception as e: logger.error( 'error :: worker :: could not get_redis_conn - %s' % str(e)) try: self.redis_conn_decoded = get_redis_conn_decoded( skyline_app) except Exception as e: logger.error( 'error :: worker :: could not get_redis_conn_decoded - %s' % str(e)) if LOCAL_DEBUG: try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: debug :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) metric_data = None try: # Get a metric from the queue with a 1 second timeout, each # metric item on the queue is a list e.g.
# metric_data = [metricName, metricValue, metricTimestamp] metric_data = self.q.get(True, 1) except Empty: if pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_submitted = submit_pickle_data_to_graphite( pickle_data) except Exception as e: logger.error( 'error :: worker :: queue Empty failed to submit_pickle_data_to_graphite - %s' % str(e)) pickle_data_submitted = False if pickle_data_submitted: pickle_data = [] logger.info('worker :: queue is empty and timed out') sleep(1) # @added 20201017 - Feature #3788: snab_flux_load_test # Send to Graphite even if worker gets no metrics if (int(time()) - last_sent_to_graphite) >= 60: logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) metric_data_queue_size = 0 try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace try: send_graphite_metric(skyline_app, skyline_metric, metric_data_queue_size) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) # @added 20201019 - Feature #3790: flux - pickle to Graphite if metric_data_queue_size > 10: send_to_receiver = 'pickle' else: send_to_receiver = 'line' send_to_receiver = 'pickle' # @added 20201120 - Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent_strs = [] for item in metrics_data_sent: metrics_data_sent_strs.append(str(item)) if metrics_data_sent_strs: try: self.redis_conn.sadd('flux.metrics_data_sent', *set(metrics_data_sent_strs)) logger.info( 'worker :: added %s items to the flux.metrics_data_sent Redis set' % str(len(metrics_data_sent))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to add items to the flux.metrics_data_sent Redis set' ) metrics_data_sent = [] try: new_set = 'aet.flux.metrics_data_sent.%s' % str( self.current_pid) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine current_pid for aet.flux.metrics_data_sent Redis set name' ) new_set = 'aet.flux.metrics_data_sent' try: self.redis_conn.rename('flux.metrics_data_sent', new_set) logger.info( 'worker :: renamed flux.metrics_data_sent Redis set to %s' % new_set) # @added 20201128 - Feature #3820: HORIZON_SHARDS # With metrics that come in at a frequency of less # than 60 seconds, it is possible that this key will # not exist as flux has not been sent metric data # so this operation will error with no such key except Exception as e: traceback_str = traceback.format_exc() if 'no such key' in str(e): logger.warn( 'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s' % (new_set, e)) else: logger.error(traceback_str) logger.error( 'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set' % new_set) try: self.redis_conn.expire(new_set, 600) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to set 600 seconds TTL on %s Redis set' % new_set) # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE if FLUX_PERSIST_QUEUE: redis_set_size = 0 try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s before removal of %s items' % (str(redis_set_size), str(len(remove_from_flux_queue_redis_set)))) if remove_from_flux_queue_redis_set: try: self.redis_conn.srem( 'flux.queue', *set(remove_from_flux_queue_redis_set)) remove_from_flux_queue_redis_set = [] except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to remove multiple items from flux.queue Redis set' ) try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s after the removal of items' % (str(redis_set_size))) remove_from_flux_queue_redis_set = [] # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Even if flux.last Redis keys are disabled in flux they are used in # Vista vista_metrics = [] if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED: try: vista_metrics = list( self.redis_conn_decoded.sscan_iter( 'vista.metrics', match='*')) except: vista_metrics = [] except NotImplementedError: pass except KeyboardInterrupt: logger.info( 'worker :: server has been issued a user signal to terminate - KeyboardInterrupt' ) except SystemExit: logger.info('worker :: server was interrupted - SystemExit') except Exception as e: logger.error('error :: worker :: %s' % (str(e))) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = False # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE if metric_data and FLUX_PERSIST_QUEUE: try: # Do not remove each individual metric from the flux.queue # Redis set, add to a list that is removed in one srem *set # operation each 60 seconds. This is a more performant # method and requires a single blocking call for a batch of # metrics, rather than a blocking call for every metric.
# self.redis_conn.srem('flux.queue', str(metric_data)) remove_from_flux_queue_redis_set.append(str(metric_data)) except: pass if metric_data: try: metric = str(metric_data[0]) value = float(metric_data[1]) timestamp = int(metric_data[2]) # @added 20200206 - Feature #3444: Allow flux to backfill # Added backfill backfill = int(metric_data[3]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: queue item found - %s' % str(metric_data)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to interpolate metric, value, timestamp from metric_data - %s' % str(metric_data)) continue # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Only check flux.last key if this is not backfill and # FLUX_CHECK_LAST_TIMESTAMP is enabled, or the metric is in VISTA_ENABLED cache_key = None # if FLUX_CHECK_LAST_TIMESTAMP: cache_key = 'flux.last.%s' % metric check_flux_last_key = False if not backfill and FLUX_CHECK_LAST_TIMESTAMP: check_flux_last_key = True if VISTA_ENABLED: if metric in vista_metrics: check_flux_last_key = True if settings.FLUX_SEND_TO_CARBON: # Best effort de-duplicate the data valid_data = True # @added 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately redis_last_metric_data = None # @modified 20200206 - Feature #3444: Allow flux to backfill # Only check flux.last key if this is not backfill # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Use the check_flux_last_key value determined above # if not backfill: if check_flux_last_key: # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Set cache_key outside the conditional block # cache_key = 'flux.last.%s' % metric last_metric_timestamp = None try: # @modified 20191128 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # redis_last_metric_data = self.redis_conn.get(cache_key) redis_last_metric_data = self.redis_conn_decoded.get( cache_key) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) redis_last_metric_data = None # @modified 20200818 - Feature #3694: flux - POST multiple metrics # Handle Redis and literal_eval separately, only # literal_eval if Redis had data for the key if redis_last_metric_data: try: last_metric_data = literal_eval( redis_last_metric_data) last_metric_timestamp = int( last_metric_data[0]) if LOCAL_DEBUG: logger.info( 'worker :: debug :: last_metric_timestamp for %s from %s is %s' % (metric, str(cache_key), str(last_metric_timestamp))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine last_metric_timestamp from Redis key %s' % str(cache_key)) last_metric_timestamp = False if last_metric_timestamp: if timestamp <= last_metric_timestamp: valid_data = False if LOCAL_DEBUG: logger.info( 'worker :: debug :: not valid data - the queue data timestamp %s is <= to the last_metric_timestamp %s for %s' % (str(timestamp), str(last_metric_timestamp), metric)) if valid_data: submittedToGraphite = False if send_to_receiver == 'line': try: graphyte.send(metric, value, timestamp) submittedToGraphite = True # modified 20201016 - Feature #3788: snab_flux_load_test if FLUX_VERBOSE_LOGGING: logger.info( 'worker :: sent %s, %s, %s to Graphite - via graphyte' % (str(metric), str(value), str(timestamp))) metrics_sent_to_graphite += 1 # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES metrics_sent.append(metric) # @added 20201120 - Feature #3790: flux - 
pickle to Graphite # Debug Redis set metrics_data_sent.append( [metric, value, timestamp]) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send metric data to Graphite for %s' % str(metric)) metric = None if send_to_receiver == 'pickle': # @modified 20201212 - Task #3864: flux - try except everything try: tuple_data = (metric, (int(timestamp), float(value))) pickle_data.append(tuple_data) if FLUX_VERBOSE_LOGGING: logger.info( 'worker :: sending %s, %s, %s to Graphite - via pickle' % (str(metric), str(value), str(timestamp))) submittedToGraphite = True metrics_sent_to_graphite += 1 metrics_sent.append(metric) # @added 20201120 - Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent.append( [metric, value, timestamp]) except Exception as e: logger.error( 'error :: worker :: failed to append to pickle_data - %s' % str(e)) if submittedToGraphite: # Update the metric Redis flux key # @modified 20200206 - Feature #3444: Allow flux to backfill # Only update the flux.last key if this is not backfill # @modified 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Use the check_flux_last_key value determined above # if not backfill: if check_flux_last_key: metric_data = [timestamp, value] # @modified 20201207 - Task #3864: flux - try except everything try: self.redis_conn.set( cache_key, str(metric_data)) except Exception as e: logger.error( 'error :: worker :: failed to set check_flux_last_key Redis key - %s' % str(e)) # @added 20200213 - Bug #3448: Repeated airgapped_metrics else: # @added 20201120 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Feature #3400: Identify air gaps in the metric data # Only execute if IDENTIFY_AIRGAPS is enabled if IDENTIFY_AIRGAPS: # @added 20200213 - Bug #3448: Repeated airgapped_metrics # Add a flux.filled key to Redis with an expiry # set to FULL_DURATION so that Analyzer knows to # sort and deduplicate the Redis time series # data as carbon-relay will send it to Horizon # and the datapoints will be out of order in the # Redis key try: flux_filled_key = 'flux.filled.%s' % str( metric) self.redis_conn.setex( flux_filled_key, settings.FULL_DURATION, int(time())) logger.info( 'worker :: set Redis key %s' % (str(flux_filled_key))) except Exception as e: logger.error( 'error :: could not set Redis flux.filled key: %s' % e) else: # modified 20201016 - Feature #3788: snab_flux_load_test if FLUX_VERBOSE_LOGGING: logger.info( 'worker :: discarded %s, %s, %s as a data point for timestamp %s has already been submitted to Graphite' % (str(metric), str(value), str(timestamp), str(timestamp))) else: logger.info( 'worker :: settings.FLUX_SEND_TO_CARBON is set to %s, discarded %s, %s, %s' % (str(settings.FLUX_SEND_TO_CARBON), str(metric), str(value), str(timestamp))) if settings.FLUX_SEND_TO_STATSD: statsd_conn.incr(metric, value, timestamp) # modified 20201016 - Feature #3788: snab_flux_load_test if FLUX_VERBOSE_LOGGING: logger.info('worker :: sent %s, %s, %s to statsd' % (metric, str(value), str(timestamp))) # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES metrics_sent.append(metric) submit_pickle_data = False if pickle_data: number_of_datapoints = len(pickle_data) if number_of_datapoints >= 1000: submit_pickle_data = True else: try: metric_data_queue_size = self.q.qsize() except: metric_data_queue_size = 0 if metric_data_queue_size == 0: submit_pickle_data = True if submit_pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_submitted = 
submit_pickle_data_to_graphite( pickle_data) except Exception as e: logger.error( 'error :: worker :: submit_pickle_data_to_graphite failed - %s' % str(e)) pickle_data_submitted = False if pickle_data_submitted: pickle_data = [] time_now = int(time()) # @added 20200827 - Feature #3708: FLUX_ZERO_FILL_NAMESPACES # Send 0 for any metric in the flux.zero_fill_metrics Redis set that # has not submitted data in the last 60 seconds. The flux.last # Redis key is not updated for these sent 0 values so if the source # sends data for a timestamp in the period later (due to a lag, etc), # it will be valid and sent to Graphite. if FLUX_ZERO_FILL_NAMESPACES: if not last_zero_fill_to_graphite: last_zero_fill_to_graphite = time_now - 60 if (time_now - last_sent_to_graphite) >= 60: try: flux_zero_fill_metrics = list( self.redis_conn_decoded.smembers( 'flux.zero_fill_metrics')) except: logger.info(traceback.format_exc()) logger.error( 'error :: failed to generate a list from flux.zero_fill_metrics Redis set' ) for flux_zero_fill_metric in flux_zero_fill_metrics: if flux_zero_fill_metric not in metrics_sent: try: graphyte.send(flux_zero_fill_metric, 0.0, time_now) # modified 20201016 - Feature #3788: snab_flux_load_test if FLUX_VERBOSE_LOGGING: logger.info( 'worker :: zero fill - sent %s, %s, %s to Graphite' % (str(flux_zero_fill_metric), str(0.0), str(time_now))) metrics_sent_to_graphite += 1 metrics_sent.append(flux_zero_fill_metric) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: zero fill - failed to send metric data to Graphite for %s' % str(flux_zero_fill_metric)) metric = None last_zero_fill_to_graphite = time_now metrics_sent = [] if (time_now - last_sent_to_graphite) >= 60: if pickle_data: # @modified 20201207 - Task #3864: flux - try except everything try: pickle_data_submitted = submit_pickle_data_to_graphite( pickle_data) except Exception as e: logger.error( 'error :: worker :: submit_pickle_data_to_graphite failed last_sent_to_graphite >= 60 - %s' % str(e)) pickle_data_submitted = False if pickle_data_submitted: pickle_data = [] logger.info( 'worker :: metrics_sent_to_graphite in last 60 seconds - %s' % str(metrics_sent_to_graphite)) skyline_metric = '%s.metrics_sent_to_graphite' % skyline_app_graphite_namespace try: # @modified 20191008 - Feature #3250: Allow Skyline to send metrics to another Carbon host # graphyte.send(skyline_metric, metrics_sent_to_graphite, time_now) send_graphite_metric(skyline_app, skyline_metric, metrics_sent_to_graphite) last_sent_to_graphite = int(time()) metrics_sent_to_graphite = 0 except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) metric_data_queue_size = 0 try: metric_data_queue_size = self.q.qsize() logger.info( 'worker :: flux.httpMetricDataQueue queue size - %s' % str(metric_data_queue_size)) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of queue flux.httpMetricDataQueue' ) skyline_metric = '%s.httpMetricDataQueue.size' % skyline_app_graphite_namespace try: send_graphite_metric(skyline_app, skyline_metric, metric_data_queue_size) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to send_graphite_metric %s with %s' % (skyline_metric, str(metrics_sent_to_graphite))) # @added 20201019 - Feature #3790: flux - pickle to Graphite if metric_data_queue_size > 10: send_to_receiver = 'pickle' else: send_to_receiver = 'line' # @added 20201120 
- Feature #3790: flux - pickle to Graphite # Debug Redis set metrics_data_sent_strs = [] for item in metrics_data_sent: metrics_data_sent_strs.append(str(item)) if metrics_data_sent_strs: try: self.redis_conn.sadd('flux.metrics_data_sent', *set(metrics_data_sent_strs)) logger.info( 'worker :: added %s items to the flux.metrics_data_sent Redis set' % str(len(metrics_data_sent))) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to add items to the flux.metrics_data_sent Redis set' ) metrics_data_sent = [] try: new_set = 'aet.flux.metrics_data_sent.%s' % str( self.current_pid) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine current_pid for aet.flux.metrics_data_sent Redis set name' ) new_set = 'aet.flux.metrics_data_sent' try: self.redis_conn.rename('flux.metrics_data_sent', new_set) logger.info( 'worker :: renamed flux.metrics_data_sent Redis set to %s' % new_set) # @modified 20201128 - Feature #3820: HORIZON_SHARDS # With metrics that come in at a frequency of less # than 60 seconds, it is possible that this key will # not exist as flux has not been sent metric data # so this operation will error with no such key except Exception as e: traceback_str = traceback.format_exc() if 'no such key' in str(e): logger.warn( 'warning :: worker :: failed to rename flux.metrics_data_sent to %s Redis set - flux has not received data in 60 seconds - %s' % (new_set, e)) else: logger.error(traceback_str) logger.error( 'error :: worker :: failed to rename flux.metrics_data_sent to %s Redis set' % new_set) try: self.redis_conn.expire(new_set, 600) except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to set 600 seconds TTL on %s Redis set' % new_set) # @added 20201018 - Feature #3798: FLUX_PERSIST_QUEUE if FLUX_PERSIST_QUEUE: redis_set_size = 0 try: redis_set_size = self.redis_conn.scard('flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s before removal of %s items' % (str(redis_set_size), str(len(remove_from_flux_queue_redis_set)))) if remove_from_flux_queue_redis_set: try: self.redis_conn.srem( 'flux.queue', *set(remove_from_flux_queue_redis_set)) remove_from_flux_queue_redis_set = [] except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to remove multiple items from flux.queue Redis set' ) try: redis_set_size = self.redis_conn.scard( 'flux.queue') except: logger.error(traceback.format_exc()) logger.error( 'error :: worker :: failed to determine size of flux.queue Redis set' ) logger.info( 'worker - flux.queue Redis set size of %s after the removal of items' % (str(redis_set_size))) remove_from_flux_queue_redis_set = [] # @added 20201020 - Feature #3796: FLUX_CHECK_LAST_TIMESTAMP # Even if flux.last Redis keys are disabled in flux they are used in # Vista vista_metrics = [] if not FLUX_CHECK_LAST_TIMESTAMP and VISTA_ENABLED: try: vista_metrics = list( self.redis_conn_decoded.sscan_iter('vista.metrics', match='*')) except: vista_metrics = []
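# NOTE: a minimal, illustrative sketch of the chunking strategy that
# submit_pickle_data_to_graphite implements above - data points are
# flushed in chunks of 480 tuples and any remainder is sent at the end.
# The network send is delegated to a caller-supplied function; the chunk
# size default is taken from the code above, the rest is illustrative,
# not a function this module defines or calls.
def example_submit_in_chunks(pickle_data, send_chunk, chunk_size=480):
    sent = 0
    chunk = []
    for tuple_data in pickle_data:
        chunk.append(tuple_data)
        if len(chunk) >= chunk_size:
            # send_chunk is expected to return True on success
            if send_chunk(chunk):
                sent += len(chunk)
            chunk = []
    if chunk:
        # flush the remainder
        if send_chunk(chunk):
            sent += len(chunk)
    return sent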
def run(self): """ Called when the process initializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: logger.info('removing %s' % skyline_app_logwait) os.remove(skyline_app_logwait) except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error :: bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) # @added 20190417 - Feature #2948: LUMINOSITY_ENABLED setting # If Luminosity is not enabled, do nothing luminosity_enabled = True try: luminosity_enabled = settings.LUMINOSITY_ENABLED logger.info('LUMINOSITY_ENABLED is set to %s' % str(luminosity_enabled)) except: logger.info( 'warning :: LUMINOSITY_ENABLED is not declared in settings.py, defaults to True' ) # @added 20190417 - Feature #2950: Report defaulted settings to log # Added all the globally declared settings to enable reporting in the # log the state of each setting. try: ENABLE_LUMINOSITY_DEBUG = settings.ENABLE_LUMINOSITY_DEBUG logger.info( 'ENABLE_LUMINOSITY_DEBUG is set from settings.py to %s' % str(ENABLE_LUMINOSITY_DEBUG)) except: logger.info( 'warning :: ENABLE_LUMINOSITY_DEBUG is not declared in settings.py, defaults to False' ) ENABLE_LUMINOSITY_DEBUG = False try: SERVER_METRIC_PATH = '.%s' % settings.SERVER_METRICS_NAME if SERVER_METRIC_PATH == '.': SERVER_METRIC_PATH = '' logger.info('SERVER_METRIC_PATH is set from settings.py to %s' % str(SERVER_METRIC_PATH)) except: SERVER_METRIC_PATH = '' logger.info( 'warning :: SERVER_METRIC_PATH is not declared in settings.py, defaults to \'\'' ) try: LUMINOSITY_PROCESSES = settings.LUMINOSITY_PROCESSES logger.info('LUMINOSITY_PROCESSES is set from settings.py to %s' % str(LUMINOSITY_PROCESSES)) except: # @modified 20180110 - Task #2266: Evaluate luminol for the luminosity branch # It is fast and lightweight # luminosity_processes = 2 LUMINOSITY_PROCESSES = 1 logger.info( 'warning :: cannot determine LUMINOSITY_PROCESSES from settings.py, defaults to %s' % str(LUMINOSITY_PROCESSES)) while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() if ENABLE_LUMINOSITY_DEBUG: logger.info('debug :: connected to Redis') except: logger.error( 'error :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(30) # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow # @modified 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # Use get_redis_conn and get_redis_conn_decoded to use on Redis sets when the bytes # types need to be decoded as utf-8 to str # if settings.REDIS_PASSWORD: # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) # else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) # @added 20191115 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 self.redis_conn = get_redis_conn(skyline_app) self.redis_conn_decoded = get_redis_conn_decoded(skyline_app) continue # 
Report app up try: self.redis_conn.setex(skyline_app, 120, now) logger.info('updated Redis key for %s up' % skyline_app) except: logger.error('error :: failed to update Redis key for %s up' % skyline_app) # @added 20190417 - Feature #: LUMINOSITY_ENABLED setting # If Luminosity is not enabled, do nothing if not luminosity_enabled: logger.info( 'luminosity is not enabled, LUMINOSITY_ENABLED is set to %s, sleeping for 20 seconds' % str(settings.LUMINOSITY_ENABLED)) sleep(20) continue """ Determine if any new anomalies have been added """ while True: process_anomaly_id = None last_processed_anomaly_id = None memcache_last_processed_anomaly_id_data = False # Check memcached before MySQL memcache_key = '%s.last.processed.anomaly.id' % skyline_app if settings.MEMCACHE_ENABLED: try: # @modified 20191029 - Task #3304: py3 - handle pymemcache bytes not str # last_processed_anomaly_id = self.memcache_client.get(memcache_key) if python_version == 2: last_processed_anomaly_id = self.memcache_client.get( memcache_key) else: last_processed_anomaly_id = self.memcache_client.get( memcache_key).decode('utf-8') # if memcache does not have the key the response to the # client is None, it does not raise an exception except: # @modified 20200507 - stop reporting this as an error # it can be expected to happen from time to time # logger.error('error :: failed to get %s from memcache' % memcache_key) logger.info( 'failed to get %s from memcache, will query DB' % memcache_key) try: self.memcache_client.close() except: logger.error( 'error :: failed to close memcache_client') if last_processed_anomaly_id: logger.info( 'last_processed_anomaly_id found in memcache - %s' % str(last_processed_anomaly_id)) memcache_last_processed_anomaly_id_data = True else: # @modified 20190517 - Bug #3016: Handle no anomaly ids in luminosity # Branch #3002: docker # Log appropriately to whether memcache is enabled or not if settings.MEMCACHE_ENABLED: logger.info( 'last_processed_anomaly_id key was NOT found in memcache - %s' % str(last_processed_anomaly_id)) else: logger.info( 'memcache not enabled, not checking for last_processed_anomaly_id key' ) if not last_processed_anomaly_id: query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1' results = None try: results = mysql_select(skyline_app, query) except: logger.error(traceback.format_exc()) logger.error('error :: MySQL query failed - %s' % query) if results: try: last_processed_anomaly_id = int(results[0][0]) logger.info( 'last_processed_anomaly_id found from DB - %s' % str(last_processed_anomaly_id)) except: logger.error(traceback.format_exc()) if last_processed_anomaly_id and settings.MEMCACHE_ENABLED: if not memcache_last_processed_anomaly_id_data: logger.info( 'Populating memcache with DB result - %s' % str(last_processed_anomaly_id)) try: self.memcache_client.set( memcache_key, int(last_processed_anomaly_id)) logger.info( 'populated memcache key %s with %s' % (memcache_key, str(last_processed_anomaly_id))) except: logger.error( 'error :: failed to set the memcache key - %s - %s' % (memcache_key, str(last_processed_anomaly_id))) try: self.memcache_client.close() except: logger.error( 'error :: failed to close memcache_client' ) if not last_processed_anomaly_id: # Check MySQL now = int(time()) after = now - 600 query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str( after) # nosec results = None try: results = mysql_select(skyline_app, query) except: logger.error('error :: MySQL query failed - %s' % query) if results: 
process_anomaly_id = int(results[0][0]) logger.info( 'found new anomaly id to process from the DB - %s' % str(process_anomaly_id)) # Handle the first one last_processed_anomaly_id = process_anomaly_id - 1 else: logger.info('no new anomalies in the anomalies table') # @added 20190517 - Bug #3016: Handle no anomaly ids in luminosity # Branch #3002: docker # When Skyline is first installed, if luminosity is enabled it # reports errors as there are no anomaly ids if str(last_processed_anomaly_id) == 'None': last_processed_anomaly_id = 0 query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str( last_processed_anomaly_id) # nosec results = None try: results = mysql_select(skyline_app, query) except: logger.error('error :: MySQL quey failed - %s' % query) if results: try: process_anomaly_id = int(results[0][0]) logger.info( 'found the next new anomaly id to process from the DB - %s' % str(process_anomaly_id)) except: logger.error(traceback.format_exc()) logger.error('error :: from query - %s' % query) else: logger.info('no new anomalies in the anomalies table') if process_anomaly_id and last_processed_anomaly_id: if isinstance(last_processed_anomaly_id, int): if isinstance(process_anomaly_id, int): if last_processed_anomaly_id == process_anomaly_id: logger.info( 'anomaly id already processed - %s' % str(process_anomaly_id)) process_anomaly_id = None if not process_anomaly_id: logger.info( 'sleeping 20 no anomalies to correlate - last processed anomaly id - %s' % str(last_processed_anomaly_id)) sleep(20) up_now = time() # Report app up try: self.redis_conn.setex(skyline_app, 120, up_now) logger.info('updated Redis key for %s up' % skyline_app) except: logger.error( 'error :: failed to update Redis key for %s up' % skyline_app) cache_key = '%s.sent_graphite_metrics' % skyline_app redis_sent_graphite_metrics = False try: redis_sent_graphite_metrics = self.redis_conn.get( cache_key) except Exception as e: logger.error( 'error :: could not query Redis for key %s: %s' % (cache_key, e)) # Flush metrics to Graphite if not redis_sent_graphite_metrics: try: # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # correlations = str(len(self.correlations)) # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # correlations = str(len(list(self.redis_conn.smembers('luminosity.correlations')))) correlations = str( len( list( self.redis_conn_decoded.smembers( 'luminosity.correlations')))) except: correlations = '0' logger.info('correlations :: %s' % correlations) send_metric_name = '%s.correlations' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, correlations) # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage try: # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # runtimes = list(self.redis_conn.smembers('luminosity.runtimes')) runtimes = list( self.redis_conn_decoded.smembers( 'luminosity.runtimes')) except: runtimes = [] # @added 20180720 - Task #2462: Implement useful metrics for Luminosity # Branch #2270: luminosity # runtime metric to monitor the time it takes to process # correlations try: # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # if len(self.runtimes) > 1: # avg_runtime = sum(self.runtimes) / len(self.runtimes) # else: # avg_runtime = sum(self.runtimes) if len(runtimes) > 1: avg_runtime = sum(runtimes) / len(runtimes) else: avg_runtime = sum(runtimes) except: avg_runtime = '0' logger.info('avg_runtime :: 
%s' % str(avg_runtime)) send_metric_name = '%s.avg_runtime' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, str(avg_runtime)) try: # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # metrics_checked_for_correlation = str(sum(self.metrics_checked_for_correlation)) # @modified 20191030 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # metrics_checked_for_correlation = str(len(list(self.redis_conn.smembers('luminosity.metrics_checked_for_correlation')))) metrics_checked_for_correlation = str( len( list( self.redis_conn_decoded.smembers( 'luminosity.metrics_checked_for_correlation' )))) except: metrics_checked_for_correlation = '0' logger.info('metrics_checked_for_correlation :: %s' % metrics_checked_for_correlation) send_metric_name = '%s.metrics_checked_for_correlation' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, metrics_checked_for_correlation) sent_graphite_metrics_now = int(time()) try: self.redis_conn.setex(cache_key, 59, sent_graphite_metrics_now) logger.info('updated Redis key - %s' % cache_key) except: logger.error( 'error :: failed to update Redis key - %s up' % cache_key) # Reset lists # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # self.correlations[:] = [] # @added 20180720 - Task #2462: Implement useful metrics for Luminosity # @modified 20190522 - Task #3034: Reduce multiprocessing Manager list usage # self.runtimes[:] = [] # self.metrics_checked_for_correlation[:] = [] # @added 20190522 - Task #3034: Reduce multiprocessing Manager list usage # Use Redis sets instead of Manager().list() delete_redis_sets = [ 'luminosity.correlations', 'luminosity.runtimes', 'luminosity.metrics_checked_for_correlation' ] for i_redis_set in delete_redis_sets: redis_set_to_delete = i_redis_set try: self.redis_conn.delete(redis_set_to_delete) logger.info('deleted Redis set - %s' % redis_set_to_delete) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to delete Redis set - %s' % redis_set_to_delete) # @added 20180720 - Task #2462: Implement useful metrics for Luminosity # Feature #2464: luminosity_remote_data # Added the ability to add a Redis key to overview the memcached # key luminosity.last.processed.anomaly.id some it does not have # to be changed via telnet to memcache. if not process_anomaly_id or not redis_sent_graphite_metrics: cache_key = '%s.last.processed.anomaly.id' % skyline_app redis_last_processed_anomaly_id_redis_key = False try: redis_last_processed_anomaly_id_redis_key = self.redis_conn.get( cache_key) except Exception as e: logger.error( 'error :: could not query Redis for key %s: %s' % (cache_key, e)) if redis_last_processed_anomaly_id_redis_key: logger.info( 'found Redis %s key to override the mecache key setting process_anomaly_id to %s' % (cache_key, str(redis_last_processed_anomaly_id_redis_key))) try: process_anomaly_id = int( redis_last_processed_anomaly_id_redis_key) except: logger.error(traceback.format_exc()) logger.error( 'error :: failed to set process_anomaly_id from Rdis override key value' ) # And remove the Redis override key as it is only meant # to override once to allow for a replay for debug # purposes only. try: self.redis_conn.setex( cache_key, 1, int(redis_last_processed_anomaly_id_redis_key)) logger.info('updated Redis key - %s' % cache_key) except: logger.error( 'error :: failed to update Redis key - %s up to 1 second expiring to delete it.' 
% cache_key) if process_anomaly_id: break # Spawn process logger.info('spawning processes to correlate anomaly id %s' % str(process_anomaly_id)) pids = [] spawned_pids = [] pid_count = 0 now = time() for i in range(1, LUMINOSITY_PROCESSES + 1): try: p = Process(target=self.spin_process, args=(i, process_anomaly_id)) pids.append(p) pid_count += 1 logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(LUMINOSITY_PROCESSES))) p.start() spawned_pids.append(p.pid) except: logger.error(traceback.format_exc()) logger.error('error :: failed to start spin_process') continue # Self monitor processes and terminate if any spin_process has run # for to long p_starts = time() while time() - p_starts <= 60: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info('%s spin_process completed in %.2f seconds' % (str(LUMINOSITY_PROCESSES), time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info('timed out, killing all spin_process processes') for p in pids: try: p.terminate() # p.join() logger.info('killed spin_process process') except: logger.error(traceback.format_exc()) logger.error( 'error :: killing all spin_process processes') for p in pids: if p.is_alive(): logger.info('stopping spin_process - %s' % (str(p.is_alive()))) p.join() process_runtime = time() - now if process_runtime < 10: sleep_for = (10 - process_runtime) logger.info( 'sleeping for %.2f seconds due to low run time...' % sleep_for) sleep(sleep_for) try: del sleep_for except: logger.error('error :: failed to del sleep_for') try: del process_runtime except: logger.error('error :: failed to del process_runtime')
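# The spawn/self-monitor/terminate pattern above can be exercised in
# isolation. A minimal, self-contained sketch follows; the names
# (spin_process, run_with_budget, the 60 second budget) are illustrative
# stand-ins, not Skyline's actual helpers.
from multiprocessing import Process
from time import sleep, time


def spin_process(i, anomaly_id):
    # stand-in for the real correlation work
    sleep(2)


def run_with_budget(processes=2, anomaly_id=1, budget=60):
    pids = [Process(target=spin_process, args=(i, anomaly_id))
            for i in range(1, processes + 1)]
    for p in pids:
        p.start()
    p_starts = time()
    while time() - p_starts <= budget:
        if any(p.is_alive() for p in pids):
            sleep(.1)  # poll without hogging the CPU
        else:
            break
    else:
        # the while/else branch only runs when the budget expired
        # without a break, i.e. something is still alive
        for p in pids:
            p.terminate()
    for p in pids:
        p.join()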
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os.remove(skyline_app_logwait) except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('error :: skyline cannot connect to redis at socket path %s' % settings.REDIS_SOCKET_PATH) sleep(10) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Report app up self.redis_conn.setex(skyline_app, 120, now) # Discover unique metrics unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics')) if len(unique_metrics) == 0: logger.info('no metrics in redis. try adding some - see README') sleep(10) continue # Reset boundary_metrics boundary_metrics = [] # Build boundary metrics for metric_name in unique_metrics: for metric in BOUNDARY_METRICS: CHECK_MATCH_PATTERN = metric[0] check_match_pattern = re.compile(CHECK_MATCH_PATTERN) base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1) pattern_match = check_match_pattern.match(base_name) if pattern_match: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: boundary metric - pattern MATCHED - ' + metric[0] + " | " + base_name) boundary_metrics.append([metric_name, metric[1]]) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: boundary metrics - ' + str(boundary_metrics)) if len(boundary_metrics) == 0: logger.info('no Boundary metrics in redis. 
try adding some - see README') sleep(10) continue # Spawn processes pids = [] for i in range(1, settings.BOUNDARY_PROCESSES + 1): if i > len(boundary_metrics): logger.info('WARNING: Skyline Boundary is set for more cores than needed.') break p = Process(target=self.spin_process, args=(i, boundary_metrics)) pids.append(p) p.start() # Send wait signal to zombie processes for p in pids: p.join() # Grab data from the queue and populate dictionaries exceptions = dict() anomaly_breakdown = dict() while 1: try: key, value = self.anomaly_breakdown_q.get_nowait() if key not in anomaly_breakdown.keys(): anomaly_breakdown[key] = value else: anomaly_breakdown[key] += value except Empty: break while 1: try: key, value = self.exceptions_q.get_nowait() if key not in exceptions.keys(): exceptions[key] = value else: exceptions[key] += value except Empty: break # Send alerts if settings.BOUNDARY_ENABLE_ALERTS: for anomalous_metric in self.anomalous_metrics: datapoint = str(anomalous_metric[0]) metric_name = anomalous_metric[1] base_name = metric_name.replace(FULL_NAMESPACE, '', 1) expiration_time = str(anomalous_metric[2]) metric_trigger = str(anomalous_metric[5]) alert_threshold = int(anomalous_metric[6]) metric_alerters = anomalous_metric[7] algorithm = anomalous_metric[8] if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: anomalous_metric - " + str(anomalous_metric)) # Determine how many times has the anomaly been seen if the # ALERT_THRESHOLD is set to > 1 and create a cache key in # redis to keep count so that alert_threshold can be honored if alert_threshold == 0: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alert_threshold - " + str(alert_threshold)) if alert_threshold == 1: times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alert_threshold - " + str(alert_threshold)) if alert_threshold > 1: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: alert_threshold - ' + str(alert_threshold)) anomaly_cache_key_count_set = False anomaly_cache_key_expiration_time = (int(alert_threshold) + 1) * 60 anomaly_cache_key = 'anomaly_seen.%s.%s' % (algorithm, base_name) try: anomaly_cache_key_count = self.redis_conn.get(anomaly_cache_key) if not anomaly_cache_key_count: try: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: redis no anomaly_cache_key - ' + str(anomaly_cache_key)) times_seen = 1 if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: redis setex anomaly_cache_key - ' + str(anomaly_cache_key)) self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except Exception as e: logger.error('error :: redis setex failed :: %s' % str(anomaly_cache_key)) logger.error('error :: could not set key: %s' % e) else: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: redis anomaly_cache_key retrieved OK - ' + str(anomaly_cache_key)) anomaly_cache_key_count_set = True except: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: redis failed - anomaly_cache_key retrieval failed - ' + str(anomaly_cache_key)) anomaly_cache_key_count_set = False if anomaly_cache_key_count_set: unpacker = Unpacker(use_list=False) unpacker.feed(anomaly_cache_key_count) raw_times_seen = list(unpacker) times_seen = int(raw_times_seen[0]) + 1 try: self.redis_conn.setex(anomaly_cache_key, anomaly_cache_key_expiration_time, packb(int(times_seen))) logger.info('error :: set anomaly seen key :: %s seen %s' % (anomaly_cache_key, str(times_seen))) except: times_seen = 1 logger.error('error :: set anomaly seen key 
failed :: %s seen %s' % (anomaly_cache_key, str(times_seen))) # Alert the alerters if times_seen > alert_threshold if times_seen >= alert_threshold: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: times_seen %s is greater than or equal to alert_threshold %s' % (str(times_seen), str(alert_threshold))) for alerter in metric_alerters.split("|"): # Determine alerter limits send_alert = False alerts_sent = 0 if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: checking alerter - %s' % alerter) try: if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: determining alerter_expiration_time for settings') alerter_expiration_time_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_expiration_time'][alerter] alerter_expiration_time = int(alerter_expiration_time_setting) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: determined alerter_expiration_time from settings - %s' % str(alerter_expiration_time)) except: # Set an arbitrary expiry time if not set alerter_expiration_time = 160 if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: could not determine alerter_expiration_time from settings") try: if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: determining alerter_limit from settings") alerter_limit_setting = settings.BOUNDARY_ALERTER_OPTS['alerter_limit'][alerter] alerter_limit = int(alerter_limit_setting) alerter_limit_set = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: determined alerter_limit from settings - %s" % str(alerter_limit)) except: alerter_limit_set = False send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: could not determine alerter_limit from settings") # If the alerter_limit is set determine how many # alerts the alerter has sent if alerter_limit_set: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) try: alerter_sent_count_key_data = self.redis_conn.get(alerter_sent_count_key) if not alerter_sent_count_key_data: if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: redis no alerter key, no alerts sent for - " + str(alerter_sent_count_key)) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alerts_sent set to %s" % str(alerts_sent)) logger.info("debug :: send_alert set to %s" % str(sent_alert)) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: redis alerter key retrieved, unpacking" + str(alerter_sent_count_key)) unpacker = Unpacker(use_list=False) unpacker.feed(alerter_sent_count_key_data) raw_alerts_sent = list(unpacker) alerts_sent = int(raw_alerts_sent[0]) if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alerter %s alerts sent %s " % (str(alerter), str(alerts_sent))) except: logger.info("No key set - %s" % alerter_sent_count_key) alerts_sent = 0 send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alerts_sent set to %s" % str(alerts_sent)) logger.info("debug :: send_alert set to %s" % str(send_alert)) if alerts_sent < alerter_limit: send_alert = True if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: alerts_sent %s is less than alerter_limit %s" % (str(alerts_sent), str(alerter_limit))) logger.info("debug :: send_alert set to %s" % str(send_alert)) # Send alert alerter_alert_sent = False if send_alert: cache_key = 'last_alert.boundary.%s.%s.%s' % (alerter, base_name, algorithm) if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: checking cache_key - %s" % cache_key) try: last_alert = self.redis_conn.get(cache_key) if not last_alert: try: self.redis_conn.setex(cache_key, int(anomalous_metric[2]), packb(int(anomalous_metric[0]))) if ENABLE_BOUNDARY_DEBUG: logger.info('debug :: key setex OK - %s' % (cache_key)) trigger_alert(alerter, 
datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) alerter_alert_sent = True except Exception as e: logger.error('error :: alert failed :: %s - %s - via %s - %s' % (base_name, datapoint, alerter, algorithm)) logger.error('error :: could not send alert: %s' % str(e)) trigger_alert('syslog', datapoint, base_name, expiration_time, metric_trigger, algorithm) else: if ENABLE_BOUNDARY_DEBUG: logger.info("debug :: cache_key exists not alerting via %s for %s is less than alerter_limit %s" % (alerter, cache_key)) trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) except: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) else: trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Update the alerts sent for the alerter cache key, # to allow for alert limiting if alerter_alert_sent and alerter_limit_set: try: alerter_sent_count_key = 'alerts_sent.%s' % (alerter) new_alerts_sent = int(alerts_sent) + 1 self.redis_conn.setex(alerter_sent_count_key, alerter_expiration_time, packb(int(new_alerts_sent))) logger.info('set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) except: logger.error('error :: failed to set %s - %s' % (alerter_sent_count_key, str(new_alerts_sent))) else: # Always alert to syslog, even if alert_threshold is not # breached or if send_alert is not True trigger_alert("syslog", datapoint, base_name, expiration_time, metric_trigger, algorithm) logger.info('alert sent :: %s - %s - via syslog - %s' % (base_name, datapoint, algorithm)) # Write anomalous_metrics to static webapp directory if len(self.anomalous_metrics) > 0: filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP)) with open(filename, 'w') as fh: # Make it JSONP with a handle_data() function anomalous_metrics = list(self.anomalous_metrics) anomalous_metrics.sort(key=operator.itemgetter(1)) fh.write('handle_data(%s)' % anomalous_metrics) run_time = time() - now total_metrics = str(len(boundary_metrics)) total_analyzed = str(len(boundary_metrics) - sum(exceptions.values())) total_anomalies = str(len(self.anomalous_metrics)) # Log progress logger.info('seconds to run :: %.2f' % run_time) logger.info('total metrics :: %s' % total_metrics) logger.info('total analyzed :: %s' % total_analyzed) logger.info('total anomalies :: %s' % total_anomalies) logger.info('exception stats :: %s' % exceptions) logger.info('anomaly breakdown :: %s' % anomaly_breakdown) # Log to Graphite graphite_run_time = '%.2f' % run_time send_metric_name = skyline_app_graphite_namespace + '.run_time' send_graphite_metric(skyline_app, send_metric_name, graphite_run_time) send_metric_name = skyline_app_graphite_namespace + '.total_analyzed' send_graphite_metric(skyline_app, send_metric_name, total_analyzed) send_metric_name = skyline_app_graphite_namespace + '.total_anomalies' send_graphite_metric(skyline_app, send_metric_name, total_anomalies) 
send_metric_name = skyline_app_graphite_namespace + '.total_metrics' send_graphite_metric(skyline_app, send_metric_name, total_metrics) for key, value in exceptions.items(): send_metric_name = '%s.exceptions.%s' % (skyline_app_graphite_namespace, key) send_graphite_metric(skyline_app, send_metric_name, str(value)) for key, value in anomaly_breakdown.items(): send_metric_name = '%s.anomaly_breakdown.%s' % (skyline_app_graphite_namespace, key) send_graphite_metric(skyline_app, send_metric_name, str(value)) # Check canary metric raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC) if raw_series is not None: unpacker = Unpacker(use_list=False) unpacker.feed(raw_series) timeseries = list(unpacker) time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600 projected = 24 * (time() - now) / time_human logger.info('canary duration :: %.2f' % time_human) send_metric_name = skyline_app_graphite_namespace + '.duration' send_graphite_metric(skyline_app, send_metric_name, str(time_human)) send_metric_name = skyline_app_graphite_namespace + '.projected' send_graphite_metric(skyline_app, send_metric_name, str(projected)) # Reset counters self.anomalous_metrics[:] = [] # Only run once per process_runtime = time() - now try: boundary_optimum_run_duration = settings.BOUNDARY_OPTIMUM_RUN_DURATION except: boundary_optimum_run_duration = 60 if process_runtime < boundary_optimum_run_duration: sleep_for = (boundary_optimum_run_duration - process_runtime) logger.info('sleeping %.2f for seconds' % sleep_for) sleep(sleep_for)
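# The alert_threshold counting above relies on a msgpack-packed counter in
# an expiring Redis key: an anomaly must recur within the TTL window to
# accumulate sightings. A minimal sketch of that technique follows; the
# connection and socket path are assumptions for illustration.
from msgpack import packb, Unpacker
from redis import StrictRedis

redis_conn = StrictRedis(unix_socket_path='/tmp/redis.sock')  # assumption


def increment_times_seen(anomaly_cache_key, expiration_time):
    raw = redis_conn.get(anomaly_cache_key)
    if raw is None:
        times_seen = 1
    else:
        unpacker = Unpacker(use_list=False)
        unpacker.feed(raw)
        times_seen = int(list(unpacker)[0]) + 1
    # setex resets the TTL on every sighting, so the count only survives
    # while the anomaly keeps recurring
    redis_conn.setex(anomaly_cache_key, expiration_time, packb(times_seen))
    return times_seen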
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os_remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os_remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('%s :: started worker' % skyline_app) FULL_NAMESPACE = settings.FULL_NAMESPACE MINI_NAMESPACE = settings.MINI_NAMESPACE MAX_RESOLUTION = settings.MAX_RESOLUTION full_uniques = '%sunique_metrics' % FULL_NAMESPACE mini_uniques = '%sunique_metrics' % MINI_NAMESPACE pipe = self.redis_conn.pipeline() last_send_to_graphite = time() queue_sizes = [] # python-2.x and python3.x had while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( '%s :: can\'t connect to redis at socket path %s' % (skyline_app, settings.REDIS_SOCKET_PATH)) sleep(10) self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) pipe = self.redis_conn.pipeline() continue try: # Get a chunk from the queue with a 15 second timeout chunk = self.q.get(True, 15) now = time() for metric in chunk: # Check if we should skip it if self.in_skip_list(metric[0]): continue # Bad data coming in if metric[1][0] < now - MAX_RESOLUTION: continue # Append to messagepack main namespace key = ''.join((FULL_NAMESPACE, metric[0])) pipe.append(key, packb(metric[1])) pipe.sadd(full_uniques, key) if not self.skip_mini: # Append to mini namespace mini_key = ''.join((MINI_NAMESPACE, metric[0])) pipe.append(mini_key, packb(metric[1])) pipe.sadd(mini_uniques, mini_key) pipe.execute() except Empty: logger.info('%s :: worker queue is empty and timed out' % skyline_app) except WatchError: logger.error('%s :: WatchError - %s' % (skyline_app, str(key))) except NotImplementedError: pass except Exception as e: logger.error('%s :: error: %s' % (skyline_app, str(e))) # Log progress if self.canary: logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize())) queue_sizes.append(self.q.qsize()) # Only send average queue mertics to graphite once per 10 seconds now = time() last_sent_graphite = now - last_send_to_graphite if last_sent_graphite > 10: number_queue_sizes = len(queue_sizes) total_of_queue_sizes = sum(queue_sizes) if total_of_queue_sizes > 0: average_queue_size = total_of_queue_sizes / number_queue_sizes else: average_queue_size = 0 logger.info( '%s :: total queue size for the last 10 seconds - %s' % (skyline_app, str(total_of_queue_sizes))) logger.info( '%s :: total queue values known for the last 10 seconds - %s' % (skyline_app, str(number_queue_sizes))) logger.info( '%s :: average queue size for the last 10 seconds - %s' % (skyline_app, str(average_queue_size))) # self.send_graphite_metric('skyline.horizon.' 
+ SERVER_METRIC_PATH + 'queue_size', self.q.qsize()) # self.send_graphite_metric('queue_size', average_queue_size) send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, average_queue_size) # reset queue_sizes and last_sent_graphite queue_sizes = [] last_send_to_graphite = time()
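# The storage step in the worker above batches everything through a single
# Redis pipeline: each datapoint is msgpack-packed and appended to a
# per-metric string key, and the key is added to a set of known metrics.
# A minimal sketch follows; the namespace, socket path and helper name are
# illustrative assumptions, not Skyline's actual settings.
from msgpack import packb
from redis import StrictRedis

redis_conn = StrictRedis(unix_socket_path='/tmp/redis.sock')  # assumption


def store_chunk(chunk, namespace='metrics.'):
    pipe = redis_conn.pipeline()
    uniques = '%sunique_metrics' % namespace
    for metric_name, datapoint in chunk:
        key = ''.join((namespace, metric_name))
        # append the packed [timestamp, value] to the metric's string key
        pipe.append(key, packb(datapoint))
        pipe.sadd(uniques, key)
    # one round trip for the whole chunk
    pipe.execute()

# e.g. store_chunk([('test.metric', [1700000000, 1.0])])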
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: logger.info('removing %s' % skyline_app_logwait) os.remove(skyline_app_logwait) except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error :: bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os.remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) while 1: now = time() # Make sure Redis is up try: self.redis_conn.ping() if ENABLE_LUMINOSITY_DEBUG: logger.info('debug :: connected to Redis') except: logger.error( 'error :: cannot connect to redis at socket path %s' % (settings.REDIS_SOCKET_PATH)) sleep(30) # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow if settings.REDIS_PASSWORD: self.redis_conn = StrictRedis( password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) else: self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH) continue # Report app up try: self.redis_conn.setex(skyline_app, 120, now) logger.info('updated Redis key for %s up' % skyline_app) except: logger.error('error :: failed to update Redis key for %s up' % skyline_app) """ Determine if any new anomalies have been added """ while True: process_anomaly_id = None last_processed_anomaly_id = None memcache_last_processed_anomaly_id_data = False # Check memcached before MySQL memcache_key = '%s.last.processed.anomaly.id' % skyline_app if settings.MEMCACHE_ENABLED: try: last_processed_anomaly_id = self.memcache_client.get( memcache_key) # if memcache does not have the key the response to the # client is None, it does not except except: logger.error( 'error :: failed to get %s from memcache' % memcache_key) try: self.memcache_client.close() except: logger.error( 'error :: failed to close memcache_client') if last_processed_anomaly_id: logger.info( 'last_processed_anomaly_id found in memcache - %s' % str(last_processed_anomaly_id)) memcache_last_processed_anomaly_id_data = True else: logger.info( 'last_processed_anomaly_id key was NOT found in memcache - %s' % str(last_processed_anomaly_id)) if not last_processed_anomaly_id: query = 'SELECT id FROM luminosity WHERE id=(SELECT MAX(id) FROM luminosity) ORDER BY id DESC LIMIT 1' results = None try: results = mysql_select(skyline_app, query) except: logger.error(traceback.format_exc()) logger.error('error :: MySQL quey failed - %s' % query) if results: try: last_processed_anomaly_id = int(results[0][0]) logger.info( 'last_processed_anomaly_id found from DB - %s' % str(last_processed_anomaly_id)) except: logger.error(traceback.format_exc()) if last_processed_anomaly_id and settings.MEMCACHE_ENABLED: if not memcache_last_processed_anomaly_id_data: logger.info( 'Populating memcache with DB result - %s' % str(last_processed_anomaly_id)) try: self.memcache_client.set( memcache_key, int(last_processed_anomaly_id)) logger.info( 'populated memcache key %s with %s' % (memcache_key, str(last_processed_anomaly_id))) except: logger.error( 
'error :: failed to set the memcache key - %s - %s' % (memcache_key, str(last_processed_anomaly_id))) try: self.memcache_client.close() except: logger.error( 'error :: failed to close memcache_client' ) if not last_processed_anomaly_id: # Check MySQL now = int(time()) after = now - 600 query = 'SELECT * FROM anomalies WHERE anomaly_timestamp > \'%s\'' % str( after) # nosec results = None try: results = mysql_select(skyline_app, query) except: logger.error('error :: MySQL quey failed - %s' % query) if results: process_anomaly_id = int(results[0][0]) logger.info( 'found new anomaly id to process from the DB - %s' % str(process_anomaly_id)) # Handle the first one last_processed_anomaly_id = process_anomaly_id - 1 else: logger.info('no new anomalies in the anomalies table') query = 'SELECT * FROM anomalies WHERE id > \'%s\'' % str( last_processed_anomaly_id) # nosec results = None try: results = mysql_select(skyline_app, query) except: logger.error('error :: MySQL quey failed - %s' % query) if results: try: process_anomaly_id = int(results[0][0]) logger.info( 'found the next new anomaly id to process from the DB - %s' % str(process_anomaly_id)) except: logger.error(traceback.format_exc()) logger.error('error :: from query - %s' % query) else: logger.info('no new anomalies in the anomalies table') if process_anomaly_id and last_processed_anomaly_id: if isinstance(last_processed_anomaly_id, int): if isinstance(process_anomaly_id, int): if last_processed_anomaly_id == process_anomaly_id: logger.info( 'anomaly id already processed - %s' % str(process_anomaly_id)) process_anomaly_id = None if not process_anomaly_id: logger.info( 'sleeping 20 no anomalies to correlate - last processed anomaly id - %s' % str(last_processed_anomaly_id)) sleep(20) up_now = time() # Report app up try: self.redis_conn.setex(skyline_app, 120, up_now) logger.info('updated Redis key for %s up' % skyline_app) except: logger.error( 'error :: failed to update Redis key for %s up' % skyline_app) cache_key = '%s.sent_graphite_metrics' % skyline_app redis_sent_graphite_metrics = False try: redis_sent_graphite_metrics = self.redis_conn.get( cache_key) except Exception as e: logger.error( 'error :: could not query Redis for key %s: %s' % (cache_key, e)) # Flush metrics to Graphite if not redis_sent_graphite_metrics: try: total_anomalies = str(len(self.anomalous_metrics)) except: total_anomalies = '0' logger.info('total_anomalies :: %s' % total_anomalies) send_metric_name = '%s.total_anomalies' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, total_anomalies) try: correlations = str(len(self.correlations)) except: correlations = '0' logger.info('correlations :: %s' % correlations) send_metric_name = '%s.correlations' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, correlations) sent_graphite_metrics_now = int(time()) try: self.redis_conn.setex(cache_key, 59, sent_graphite_metrics_now) logger.info('updated Redis key - %s' % cache_key) except: logger.error( 'error :: failed to update Redis key - %s up' % cache_key) # Reset lists self.anomalous_metrics[:] = [] self.correlations[:] = [] if process_anomaly_id: break # Spawn process logger.info('spawning processes to correlate anomaly id %s' % str(process_anomaly_id)) pids = [] spawned_pids = [] pid_count = 0 now = time() for i in range(1, luminosity_processes + 1): try: p = Process(target=self.spin_process, args=(i, process_anomaly_id)) pids.append(p) pid_count += 1 logger.info('starting %s of %s spin_process/es' % 
(str(pid_count), str(luminosity_processes))) p.start() spawned_pids.append(p.pid) except: logger.error(traceback.format_exc()) logger.error('error :: failed to start spin_process') continue # Self monitor processes and terminate if any spin_process has run # for to long p_starts = time() while time() - p_starts <= 60: if any(p.is_alive() for p in pids): # Just to avoid hogging the CPU sleep(.1) else: # All the processes are done, break now. time_to_run = time() - p_starts logger.info('%s spin_process completed in %.2f seconds' % (str(luminosity_processes), time_to_run)) break else: # We only enter this if we didn't 'break' above. logger.info('timed out, killing all spin_process processes') for p in pids: try: p.terminate() # p.join() logger.info('killed spin_process process') except: logger.error(traceback.format_exc()) logger.error( 'error :: killing all spin_process processes') for p in pids: if p.is_alive(): logger.info('stopping spin_process - %s' % (str(p.is_alive()))) p.join() process_runtime = time() - now if process_runtime < 10: sleep_for = (10 - process_runtime) logger.info( 'sleeping for %.2f seconds due to low run time...' % sleep_for) sleep(sleep_for) try: del sleep_for except: logger.error('error :: failed to del sleep_for') try: del process_runtime except: logger.error('error :: failed to del process_runtime')
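# The last_processed_anomaly_id lookups above follow a read-through order:
# memcached first, MySQL on a miss, then a backfill of memcached so the
# next iteration avoids the DB. A minimal sketch of that ordering follows;
# mysql_select is passed in as a stand-in and the memcached address is an
# assumption for illustration.
from pymemcache.client.base import Client

memcache_client = Client(('127.0.0.1', 11211))  # assumption


def get_last_processed_id(mysql_select, memcache_key='luminosity.last.processed.anomaly.id'):
    last_id = None
    try:
        cached = memcache_client.get(memcache_key)
        if cached is not None:
            last_id = int(cached)
    except Exception:
        # a memcache failure just falls through to the DB
        last_id = None
    if last_id is None:
        results = mysql_select('SELECT MAX(id) FROM luminosity')
        if results:
            last_id = int(results[0][0])
            # backfill the cache so the next check skips the DB
            memcache_client.set(memcache_key, last_id)
    return last_id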
def submit_PrometheusMetrics(self):
    """
    Get Prometheus metrics from the Redis flux.prometheus_metrics hash key
    and submit them to the Horizon worker queue
    """
    last_hash_check = int(time())
    last_log = int(last_hash_check)
    submitted_metrics = 0
    running = True
    while running:
        self.check_if_parent_is_alive()
        now_ts = int(time())
        if now_ts >= (last_log + 59):
            logger.info('prometheus :: submit_PrometheusMetrics - submitted %s metrics to worker in the last 60 seconds' % str(submitted_metrics))
            submitted_metrics = 0
            last_log = int(now_ts)
        if (now_ts - last_hash_check) < 1:
            last_hash_check = int(now_ts)
            continue
        last_hash_check = int(now_ts)
        try:
            prometheus_metrics = {}
            try:
                prometheus_metrics = self.redis_conn_decoded.hgetall('flux.prometheus_metrics')
            except Exception as err:
                logger.error('error :: prometheus :: submit_PrometheusMetrics could not determine prometheus_metrics from flux.prometheus_metrics Redis hash - %s' % str(err))
                prometheus_metrics = {}
            if not prometheus_metrics:
                continue
            chunk = []
            metrics_added = 0
            prometheus_metrics_list = list(prometheus_metrics.keys())
            for prometheus_metric in prometheus_metrics_list:
                if metrics_added >= HORIZON_PROMETHEUS_METRIC_LIMIT:
                    break
                try:
                    metric_data_str = prometheus_metrics[prometheus_metric]
                    # The data is stored by flux in the Redis hash key as
                    # 'metric': 'timestamp,value' to reduce the overhead of
                    # needing to use literal_eval on every call, instead a
                    # simple str split is used
                    timestamp = metric_data_str.split(',')[0]
                    value = metric_data_str.split(',')[1]
                    chunk.append([prometheus_metric, [int(timestamp), float(value)]])
                    metrics_added += 1
                    submitted_metrics += 1
                    self.redis_conn_decoded.hdel('flux.prometheus_metrics', prometheus_metric)
                except Exception as err:
                    logger.error('error :: prometheus :: submit_PrometheusMetrics could not determine prometheus_metrics from flux.prometheus_metrics Redis hash - %s' % str(err))
                if len(chunk) > settings.CHUNK_SIZE:
                    break
            if chunk:
                try:
                    self.q.put(list(chunk), block=False)
                    chunk[:] = []
                # Drop chunk if queue is full
                except Full:
                    chunks_dropped = str(len(chunk))
                    logger.info('%s :: prometheus :: submit_PrometheusMetrics :: worker queue is full, dropping %s Prometheus metric datapoints' % (skyline_app, chunks_dropped))
                    send_metric_name = '%s.prometheus_chunks_dropped' % skyline_app_graphite_namespace
                    send_graphite_metric(skyline_app, send_metric_name, chunks_dropped)
                    chunk[:] = []
        except Exception as err:
            logger.info('%s :: prometheus :: submit_PrometheusMetrics :: error adding check to queue - %s' % (skyline_app, str(err)))
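# The 'metric': 'timestamp,value' hash encoding described in the comments
# above trades structure for speed: a plain str.split on read instead of a
# literal_eval per datapoint. A minimal sketch of both directions follows;
# the hash name matches the code above, while the connection and socket
# path are assumptions for illustration.
from redis import StrictRedis

redis_conn_decoded = StrictRedis(
    unix_socket_path='/tmp/redis.sock', decode_responses=True)  # assumption


def encode_datapoint(metric, timestamp, value):
    redis_conn_decoded.hset(
        'flux.prometheus_metrics', metric, '%s,%s' % (int(timestamp), value))


def decode_datapoint(metric):
    metric_data_str = redis_conn_decoded.hget('flux.prometheus_metrics', metric)
    timestamp, value = metric_data_str.split(',')
    return [metric, [int(timestamp), float(value)]]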
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os_remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error( 'error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os_remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('%s :: started worker' % skyline_app) FULL_NAMESPACE = settings.FULL_NAMESPACE MINI_NAMESPACE = settings.MINI_NAMESPACE MAX_RESOLUTION = settings.MAX_RESOLUTION full_uniques = '%sunique_metrics' % FULL_NAMESPACE mini_uniques = '%sunique_metrics' % MINI_NAMESPACE pipe = self.redis_conn.pipeline() last_send_to_graphite = time() queue_sizes = [] # python-2.x and python3.x handle while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up try: self.redis_conn.ping() except: logger.error( '%s :: cannot connect to Redis at socket path %s' % (skyline_app, settings.REDIS_SOCKET_PATH)) sleep(10) # @modified 20180519 - Feature #2378: Add redis auth to Skyline and rebrow if settings.REDIS_PASSWORD: # @modified 20191014 - Bug #3266: py3 Redis binary objects not strings # Branch #3262: py3 # self.redis_conn = StrictRedis(password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = StrictRedis( password=settings.REDIS_PASSWORD, unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) else: # self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) self.redis_conn = StrictRedis( unix_socket_path=settings.REDIS_SOCKET_PATH, charset='utf-8', decode_responses=True) pipe = self.redis_conn.pipeline() continue try: # Get a chunk from the queue with a 15 second timeout chunk = self.q.get(True, 15) # @modified 20170317 - Feature #1978: worker - DO_NOT_SKIP_LIST # now = time() now = int(time()) for metric in chunk: # Check if we should skip it if self.in_skip_list(metric[0]): continue # Bad data coming in # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # if metric[1][0] < now - MAX_RESOLUTION: # continue try: if int(metric[1][0]) < now - MAX_RESOLUTION: continue except: pass # Append to messagepack main namespace # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # key = ''.join((FULL_NAMESPACE, metric[0])) key = ''.join((FULL_NAMESPACE, str(metric[0]))) if LOCAL_DEBUG: logger.info( 'debug :: worker :: adding metric Redis key - %s' % str(key)) # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # pipe.append(key, packb(metric[1])) # pipe.sadd(full_uniques, key) try: pipe.append(str(key), packb(metric[1])) except Exception as e: logger.error('%s :: error on pipe.append: %s' % (skyline_app, str(e))) try: # pipe.sadd(full_uniques, key) pipe.sadd(full_uniques, 
str(key)) except Exception as e: logger.error('%s :: error on pipe.sadd: %s' % (skyline_app, str(e))) if not self.skip_mini: # Append to mini namespace # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # mini_key = ''.join((MINI_NAMESPACE, metric[0])) mini_key = ''.join((MINI_NAMESPACE, str(metric[0]))) pipe.append(mini_key, packb(metric[1])) pipe.sadd(mini_uniques, mini_key) # @modified 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # pipe.execute() try: pipe.execute() except Exception as e: logger.error('%s :: error on pipe.execute: %s' % (skyline_app, str(e))) except Empty: logger.info('%s :: worker queue is empty and timed out' % skyline_app) except WatchError: logger.error('%s :: WatchError - %s' % (skyline_app, str(key))) except NotImplementedError: pass except Exception as e: # @added 20190130 - Task #2690: Test Skyline on Python-3.6.7 # Branch #3262: py3 # Bug #3266: py3 Redis binary objects not strings # Added traceback logger.error(traceback.format_exc()) logger.error('%s :: error: %s' % (skyline_app, str(e))) # Log progress if self.canary: logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize())) queue_sizes.append(self.q.qsize()) # Only send average queue mertics to graphite once per 10 seconds now = time() last_sent_graphite = now - last_send_to_graphite if last_sent_graphite > 10: number_queue_sizes = len(queue_sizes) total_of_queue_sizes = sum(queue_sizes) if total_of_queue_sizes > 0: average_queue_size = total_of_queue_sizes / number_queue_sizes else: average_queue_size = 0 logger.info( '%s :: total queue size for the last 10 seconds - %s' % (skyline_app, str(total_of_queue_sizes))) logger.info( '%s :: total queue values known for the last 10 seconds - %s' % (skyline_app, str(number_queue_sizes))) logger.info( '%s :: average queue size for the last 10 seconds - %s' % (skyline_app, str(average_queue_size))) # self.send_graphite_metric('skyline.horizon.' + SERVER_METRIC_PATH + 'queue_size', self.q.qsize()) # self.send_graphite_metric('queue_size', average_queue_size) send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, average_queue_size) # reset queue_sizes and last_sent_graphite queue_sizes = [] last_send_to_graphite = time()
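# The py3 branch above, together with the earlier get_redis_conn /
# get_redis_conn_decoded comments, distinguishes two kinds of Redis
# connections: msgpack payloads appended to metric keys are raw bytes and
# should not pass through decode_responses, while set members read back for
# string comparison are wanted as str. A minimal sketch of that dual
# connection pattern follows; both connections and the socket path are
# assumptions for illustration, mirroring the two helper names.
from msgpack import packb
from redis import StrictRedis

SOCKET_PATH = '/tmp/redis.sock'  # assumption
redis_conn = StrictRedis(unix_socket_path=SOCKET_PATH)  # bytes in, bytes out
redis_conn_decoded = StrictRedis(
    unix_socket_path=SOCKET_PATH, charset='utf-8', decode_responses=True)

# binary msgpack data goes through the raw connection
redis_conn.append('metrics.test.metric', packb([1700000000, 1.0]))
redis_conn.sadd('metrics.unique_metrics', 'metrics.test.metric')

# reads that should compare as str go through the decoded connection
unique_metrics = list(redis_conn_decoded.smembers('metrics.unique_metrics'))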
def run(self): """ Called when the process intializes. """ # Log management to prevent overwriting # Allow the bin/<skyline_app>.d to manage the log if os.path.isfile(skyline_app_logwait): try: os_remove(skyline_app_logwait) except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_logwait) pass now = time() log_wait_for = now + 5 while now < log_wait_for: if os.path.isfile(skyline_app_loglock): sleep(.1) now = time() else: now = log_wait_for + 1 logger.info('starting %s run' % skyline_app) if os.path.isfile(skyline_app_loglock): logger.error('error - bin/%s.d log management seems to have failed, continuing' % skyline_app) try: os_remove(skyline_app_loglock) logger.info('log lock file removed') except OSError: logger.error('error - failed to remove %s, continuing' % skyline_app_loglock) pass else: logger.info('bin/%s.d log management done' % skyline_app) logger.info('%s :: started worker' % skyline_app) FULL_NAMESPACE = settings.FULL_NAMESPACE MINI_NAMESPACE = settings.MINI_NAMESPACE MAX_RESOLUTION = settings.MAX_RESOLUTION full_uniques = '%sunique_metrics' % FULL_NAMESPACE mini_uniques = '%sunique_metrics' % MINI_NAMESPACE pipe = self.redis_conn.pipeline() last_send_to_graphite = time() queue_sizes = [] # python-2.x and python3.x had while 1 and while True differently # while 1: running = True while running: # Make sure Redis is up try: self.redis_conn.ping() except: logger.error('%s :: can\'t connect to redis at socket path %s' % (skyline_app, settings.REDIS_SOCKET_PATH)) sleep(10) self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH) pipe = self.redis_conn.pipeline() continue try: # Get a chunk from the queue with a 15 second timeout chunk = self.q.get(True, 15) now = time() for metric in chunk: # Check if we should skip it if self.in_skip_list(metric[0]): continue # Bad data coming in if metric[1][0] < now - MAX_RESOLUTION: continue # Append to messagepack main namespace key = ''.join((FULL_NAMESPACE, metric[0])) pipe.append(key, packb(metric[1])) pipe.sadd(full_uniques, key) if not self.skip_mini: # Append to mini namespace mini_key = ''.join((MINI_NAMESPACE, metric[0])) pipe.append(mini_key, packb(metric[1])) pipe.sadd(mini_uniques, mini_key) pipe.execute() except Empty: logger.info('%s :: worker queue is empty and timed out' % skyline_app) except WatchError: logger.error('%s :: WatchError - %s' % (skyline_app, str(key))) except NotImplementedError: pass except Exception as e: logger.error('%s :: error: %s' % (skyline_app, str(e))) # Log progress if self.canary: logger.info('%s :: queue size at %d' % (skyline_app, self.q.qsize())) queue_sizes.append(self.q.qsize()) # Only send average queue mertics to graphite once per 10 seconds now = time() last_sent_graphite = now - last_send_to_graphite if last_sent_graphite > 10: number_queue_sizes = len(queue_sizes) total_of_queue_sizes = sum(queue_sizes) if total_of_queue_sizes > 0: average_queue_size = total_of_queue_sizes / number_queue_sizes else: average_queue_size = 0 logger.info('%s :: total queue size for the last 10 seconds - %s' % (skyline_app, str(total_of_queue_sizes))) logger.info('%s :: total queue values known for the last 10 seconds - %s' % (skyline_app, str(number_queue_sizes))) logger.info('%s :: average queue size for the last 10 seconds - %s' % (skyline_app, str(average_queue_size))) # self.send_graphite_metric('skyline.horizon.' 
+ SERVER_METRIC_PATH + 'queue_size', self.q.qsize()) # self.send_graphite_metric('queue_size', average_queue_size) send_metric_name = '%s.queue_size' % skyline_app_graphite_namespace send_graphite_metric(skyline_app, send_metric_name, average_queue_size) # reset queue_sizes and last_sent_graphite queue_sizes = [] last_send_to_graphite = time()