Example #1
    def spin_process(self, i, run_timestamp, metric_check_file):
        """
        Assign a metric for a process to analyze.

        :param i: python process id
        :param run_timestamp: the epoch timestamp at which this process was called
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('child_process_pid - %s' % str(child_process_pid))

        self.process_list.append(child_process_pid)

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' %
                         (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_timestamp - %s' %
                        str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_txt - %s' %
                        check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_dir - %s' %
                        check_file_metricname_dir)
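        # e.g. a hypothetical check file named '1480000000.stats.web01.cpu.user.txt'
        # decomposes to timestamp '1480000000', metric 'stats.web01.cpu.user'
        # and metric dir 'stats/web01/cpu/user'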

        metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric_failed_check_dir - %s' %
                        metric_failed_check_dir)

        # failed_check_file = failed_checks_dir + '/' + check_file_name
        failed_check_file = metric_failed_check_dir + '/' + check_file_name

        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.error(
                'error :: failed to import metric variables from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            # TBD - a failed check Panorama update will go here, perhaps txt
            #       files are not the only "queue" that will be used, both, but
            #       Panorama, may be just a part of Skyline Flux, the flux DB
            #       would allow for a very nice, distributed "queue" and a
            #       distributed Skyline workforce...
            #       Any Skyline node could just have one role, e.g. lots of
            #       Skyline nodes running crucible only and instead of reading
            #       the local filesystem for input, they could read the Flux DB
            #       queue or both...
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined.
        # This ensures that if any of the variables are not set for some
        # reason we can handle unexpected data or situations gracefully and
        # try to ensure that the process does not hang.

        # if len(str(metric_vars.metric)) == 0:
        # if not metric_vars.metric:
        try:
            metric_vars.metric
        except:
            logger.error(
                'error :: failed to read metric variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            metric = str(metric_vars.metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric - %s' % metric)

        # if len(metric_vars.value) == 0:
        # if not metric_vars.value:
        try:
            metric_vars.value
        except:
            logger.error(
                'error :: failed to read value variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            value = str(metric_vars.value)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - value - %s' % (value))

        # if len(metric_vars.from_timestamp) == 0:
        # if not metric_vars.from_timestamp:
        try:
            metric_vars.from_timestamp
        except:
            logger.error(
                'error :: failed to read from_timestamp variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            from_timestamp = str(metric_vars.from_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - from_timestamp - %s' %
                            from_timestamp)

        # if len(metric_vars.metric_timestamp) == 0:
        # if not metric_vars.metric_timestamp:
        try:
            metric_vars.metric_timestamp
        except:
            logger.error(
                'error :: failed to read metric_timestamp variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            metric_timestamp = str(metric_vars.metric_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric_timestamp - %s' %
                            metric_timestamp)

        # if len(metric_vars.algorithms) == 0:
        # if not metric_vars.algorithms:
        try:
            metric_vars.algorithms
        except:
            logger.error(
                'error :: failed to read algorithms variable from check file - %s - setting to all'
                % (metric_check_file))
            algorithms = ['all']
        else:
            algorithms = []
            for i_algorithm in metric_vars.algorithms:
                algorithms.append(i_algorithm)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - algorithms - %s' % algorithms)

        # if len(metric_vars.anomaly_dir) == 0:
        # if not metric_vars.anomaly_dir:
        try:
            metric_vars.anomaly_dir
        except:
            logger.error(
                'error :: failed to read anomaly_dir variable from check file -  %s'
                % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            anomaly_dir = str(metric_vars.anomaly_dir)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

        # if len(str(metric_vars.graphite_metric)) == 0:
        try:
            metric_vars.graphite_metric
        except:
            logger.info(
                'failed to read graphite_metric variable from check file - setting to False'
            )
            # yes this is a string
            graphite_metric = 'False'
        else:
            graphite_metric = str(metric_vars.graphite_metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - graphite_metric - %s' %
                            graphite_metric)

        # if len(str(metric_vars.run_crucible_tests)) == 0:
        try:
            metric_vars.run_crucible_tests
        except:
            logger.info(
                'failed to read run_crucible_tests variable from check file - setting to False'
            )
            # yes this is a string
            run_crucible_tests = 'False'
        else:
            run_crucible_tests = str(metric_vars.run_crucible_tests)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_crucible_tests - %s' %
                            run_crucible_tests)

        try:
            metric_vars.added_by
        except:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info(
                    'failed to read added_by variable from check file - setting to crucible'
                )
            added_by = 'crucible'
        else:
            added_by = str(metric_vars.added_by)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - added_by - %s' % added_by)

        try:
            metric_vars.run_script
        except:
            run_script = False
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info(
                    'metric variable - run_script - not present set to False')
        else:
            run_script = str(metric_vars.run_script)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - %s' % run_script)

        # Only check if the metric does not have an EXPIRATION_TIME key set.
        # Crucible uses the alert EXPIRATION_TIME for the relevant alert
        # setting contexts, whether that be analyzer, mirage, boundary, etc,
        # and sets its own cache_keys in redis.  This prevents large amounts
        # of data being added in terms of timeseries json and image files, as
        # crucible samples at the same EXPIRATION_TIME as alerts.

        source_app = 'crucible'
        expiration_timeout = 1800
        remove_all_anomaly_files = False
        check_expired = False
        check_time = time()

        if added_by == 'analyzer' or added_by == 'mirage':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Will check %s ALERTS' % added_by)
            if settings.ENABLE_ALERTS:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking %s ALERTS' % added_by)
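                # each alert tuple is assumed to be of the form
                # (metric_pattern, alerter, EXPIRATION_TIME),
                # e.g. ('stats\..*', 'smtp', 3600)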
                for alert in settings.ALERTS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = added_by
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info(
                                'matched - %s - %s - EXPIRATION_TIME is %s' %
                                (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info(
                                    'the check is older than EXPIRATION_TIME for the metric - not checking - check_expired'
                                )

        if added_by == 'boundary':
            if settings.BOUNDARY_ENABLE_ALERTS:
                for alert in settings.BOUNDARY_METRICS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = 'boundary'
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info(
                                'matched - %s - %s - EXPIRATION_TIME is %s' %
                                (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info(
                                    'the check is older than EXPIRATION_TIME for the metric - not checking - check_expired'
                                )

        cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('cache_key - crucible.last_check.%s.%s' %
                        (source_app, metric))

        # Only use the cache_key EXPIRATION_TIME if this is not a request to
        # run_crucible_tests on a timeseries
        if run_crucible_tests == 'False':
            if check_expired:
                logger.info('check_expired - not checking Redis key')
                last_check = True
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking if cache_key exists')
                try:
                    last_check = self.redis_conn.get(cache_key)
                except Exception as e:
                    logger.error(
                        'error :: could not query cache_key for %s - %s - %s' %
                        (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
                    # ensure last_check is defined if the Redis query failed
                    last_check = None

            if not last_check:
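                # no recent check - set the key (with a msgpack packed value)
                # so further checks within EXPIRATION_TIME are deduplicated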
                try:
                    self.redis_conn.setex(cache_key, expiration_timeout,
                                          packb(value))
                    logger.info(
                        'set cache_key for %s - %s with timeout of %s' %
                        (source_app, metric, str(expiration_timeout)))
                except Exception as e:
                    logger.error(
                        'error :: could not set cache_key for %s - %s - %s' %
                        (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
            else:
                if check_expired:
                    logger.info(
                        'check_expired - all anomaly files will be removed')
                    remove_all_anomaly_files = True
                else:
                    logger.info(
                        'cache_key is set and not expired for %s - %s - all anomaly files will be removed'
                        % (source_app, metric))
                    remove_all_anomaly_files = True

        # anomaly dir
        if not os.path.exists(str(anomaly_dir)):
            try:
                # mkdir_p(skyline_app, str(anomaly_dir))
                if python_version == 2:
                    # base 8 - the string is an octal mode
                    mode_arg = int('0755', 8)
                if python_version == 3:
                    mode_arg = 0o755
                os.makedirs(anomaly_dir, mode_arg)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('created anomaly dir - %s' % str(anomaly_dir))
            except:
                logger.error('error :: failed to create anomaly_dir - %s' %
                             str(anomaly_dir))

        if not os.path.exists(str(anomaly_dir)):
            logger.error('error :: anomaly_dir does not exist')
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly dir exists - %s' % str(anomaly_dir))

        failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed_check_file - %s' % str(failed_check_file))

        # Retrieve data from graphite if necessary
        anomaly_graph = anomaly_dir + '/' + metric + '.png'
        anomaly_json = anomaly_dir + '/' + metric + '.json'
        anomaly_json_gz = anomaly_json + '.gz'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_graph - %s' % str(anomaly_graph))
            logger.info('anomaly_json - %s' % str(anomaly_json))
            logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

        # Some things added to crucible may not be added by a skyline app per se
        # and if run_crucible_tests is string True then no anomaly files should
        # be removed.
        if run_crucible_tests == 'True':
            remove_all_anomaly_files = False

        # Remove check and anomaly files if the metric has a EXPIRATION_TIME
        # cache_key set
        if remove_all_anomaly_files:
            if os.path.isfile(anomaly_graph):
                try:
                    os.remove(anomaly_graph)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_graph removed - %s' %
                                    str(anomaly_graph))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json):
                try:
                    os.remove(anomaly_json)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json removed - %s' %
                                    str(anomaly_json))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json_gz):
                try:
                    os.remove(anomaly_json_gz)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json_gz removed - %s' %
                                    str(anomaly_json_gz))
                except OSError:
                    pass

            anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
            if os.path.isfile(anomaly_txt_file):
                try:
                    os.remove(anomaly_txt_file)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_txt_file removed - %s' %
                                    str(anomaly_txt_file))
                except OSError:
                    pass

            # TBD - this data would have to be added to the Panorama DB before
            #       it is removed
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('metric_check_file removed - %s' %
                                    str(metric_check_file))
                except OSError:
                    pass

            if os.path.exists(str(anomaly_dir)):
                try:
                    os.rmdir(str(anomaly_dir))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_dir removed - %s' %
                                    str(anomaly_dir))
                except OSError:
                    pass

            logger.info('check and anomaly files removed')
            return

        # Check if the image exists
        if graphite_metric == 'True':

            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite_metric - %s' % (graphite_metric))

            # Graphite timeouts
            connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('connect_timeout - %s' % str(connect_timeout))

            read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('read_timeout - %s' % str(read_timeout))

            graphite_until = datetime.datetime.fromtimestamp(
                int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
            graphite_from = datetime.datetime.fromtimestamp(
                int(from_timestamp)).strftime('%H:%M_%Y%m%d')

            # graphite URL
            if settings.GRAPHITE_PORT != '':
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            else:
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite url - %s' % (url))
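            # e.g. a hypothetical rendered URL:
            # http://graphite.example.com/render/?from=00:00_20160101&until=12:00_20160102&target=stats.web01.cpu.user&format=json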

            if not os.path.isfile(anomaly_graph):
                logger.info(
                    'retrieving png - surfacing %s graph from graphite from %s to %s'
                    % (metric, graphite_from, graphite_until))

                image_url = url.replace('&format=json', '')
                graphite_image_file = anomaly_dir + '/' + metric + '.png'
                if 'width' not in image_url:
                    image_url += '&width=586'
                if 'height' not in image_url:
                    image_url += '&height=308'
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('graphite image url - %s' % (image_url))
                image_url_timeout = int(connect_timeout)

                image_data = None

                try:
                    image_data = urllib2.urlopen(
                        image_url, timeout=image_url_timeout).read()
                    logger.info('url OK - %s' % (image_url))
                except urllib2.URLError:
                    image_data = None
                    logger.error('error :: url bad - %s' % (image_url))

                if image_data is not None:
                    # image_data is bytes, so write in binary mode
                    with open(graphite_image_file, 'wb') as f:
                        f.write(image_data)
                    logger.info('retrieved - %s' % (anomaly_graph))
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(graphite_image_file, mode_arg)
                else:
                    logger.error('error :: failed to retrieve - %s' %
                                 (anomaly_graph))
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph file exists - %s' %
                                str(anomaly_graph))

            if not os.path.isfile(anomaly_graph):
                logger.error(
                    'error :: failed to surface %s graph from graphite'
                    % (metric))
            else:
                logger.info('graph image exists - %s' % (anomaly_graph))

            # Check if the json exists
            if not os.path.isfile(anomaly_json_gz):
                if not os.path.isfile(anomaly_json):
                    logger.info(
                        'surfacing timeseries data for %s from graphite from %s to %s'
                        % (metric, graphite_from, graphite_until))
                    if requests.__version__ >= '2.4.0':
                        use_timeout = (int(connect_timeout), int(read_timeout))
                    else:
                        use_timeout = int(connect_timeout)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('use_timeout - %s' % (str(use_timeout)))

                    try:
                        r = requests.get(url, timeout=use_timeout)
                        js = r.json()
                        datapoints = js[0]['datapoints']
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('data retrieved OK')
                    except:
                        # graphite_until is a formatted string - use the
                        # epoch metric_timestamp for the single null datapoint
                        datapoints = [[None, int(metric_timestamp)]]
                        logger.error('error :: data retrieval failed')

                    converted = []
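                    # Graphite datapoints are [value, timestamp] pairs -
                    # convert to [timestamp, value], skipping None values
                    # via the except below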
                    for datapoint in datapoints:
                        try:
                            new_datapoint = [
                                float(datapoint[1]),
                                float(datapoint[0])
                            ]
                            converted.append(new_datapoint)
                        except:
                            continue

                    with open(anomaly_json, 'w') as f:
                        f.write(json.dumps(converted))
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('json file - %s' % anomaly_json)

                if not os.path.isfile(anomaly_json):
                    logger.error(
                        'error :: failed to surface %s json from graphite' %
                        (metric))
                    # Move metric check file
                    try:
                        shutil.move(metric_check_file, failed_check_file)
                        logger.info('moved check file to - %s' %
                                    failed_check_file)
                    except OSError:
                        logger.error(
                            'error :: failed to move check file to - %s' %
                            failed_check_file)
                        pass
                    return

        # Check timeseries json exists - raw or gz
        if not os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                logger.error('error :: no json data found - %s' % (metric))
                # Move metric check file
                try:
                    shutil.move(metric_check_file, failed_check_file)
                    logger.info('moved check file to - %s' % failed_check_file)
                except OSError:
                    logger.error('error :: failed to move check file to - %s' %
                                 failed_check_file)
                    pass
                return
            else:
                logger.info('timeseries json gzip exists - %s' %
                            (anomaly_json_gz))
        else:
            logger.info('timeseries json exists - %s' % (anomaly_json))

        # If timeseries json and run_crucible_tests is str(False) gzip and
        # return here as there is nothing further to do
        if run_crucible_tests == 'False':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('run_crucible_tests - %s' % run_crucible_tests)
            # gzip the json timeseries data
            if os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipping - %s' % anomaly_json)
                try:
                    # open the source in binary mode so the gzip writelines
                    # works on both Python 2 and 3
                    f_in = open(anomaly_json, 'rb')
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('gzipped - %s' % anomaly_json_gz)
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' %
                                    metric_check_file)
                    except OSError:
                        pass
                    return
                except:
                    logger.error('error :: Failed to gzip data file - %s' %
                                 traceback.format_exc())
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' %
                                    metric_check_file)
                    except OSError:
                        pass
                    return

            if os.path.isfile(anomaly_json_gz):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzip exists - %s' % anomaly_json_gz)
                # the gzip already exists so remove the check file and return,
                # whether debug is enabled or not
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' %
                                metric_check_file)
                except OSError:
                    pass
                return
            nothing_to_do = 'true - for debug only'

        # self.check_if_parent_is_alive()
        # Run crucible algorithms
        logger.info('running crucible tests - %s' % (metric))

        if os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipping - %s' % anomaly_json_gz)
                try:
                    # with gzip.open(anomaly_json_gz, 'rb') as fr:
                    fr = gzip.open(anomaly_json_gz, 'rb')
                    raw_timeseries = fr.read()
                    fr.close()
                except Exception as e:
                    logger.error('error :: could not ungzip %s - %s' %
                                 (anomaly_json_gz, e))
                    traceback.print_exc()
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipped')
                    logger.info('writing to - %s' % anomaly_json)
                # raw_timeseries is bytes from gzip, so write in binary mode
                with open(anomaly_json, 'wb') as fw:
                    fw.write(raw_timeseries)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json done')
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(anomaly_json, mode_arg)
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('No gzip - %s' % anomaly_json_gz)
            nothing_to_do = 'true - for debug only'

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json exists - %s' % anomaly_json)

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loading timeseries from - %s' % anomaly_json)
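            # the json file holds [[timestamp, value], ...] pairs as written
            # by the Graphite conversion above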
            with open(anomaly_json, 'r') as f:
                timeseries = json.loads(f.read())
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loaded timeseries from - %s' % anomaly_json)
        else:
            try:
                logger.error('error :: file not found - %s' % anomaly_json)
                shutil.move(metric_check_file, failed_check_file)
                if python_version == 2:
                    mode_arg = int('0644', 8)
                if python_version == 3:
                    mode_arg = 0o644
                os.chmod(failed_check_file, mode_arg)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' %
                             failed_check_file)
                pass
            return

        start_timestamp = int(timeseries[0][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('start_timestamp - %s' % str(start_timestamp))
        end_timestamp = int(timeseries[-1][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('end_timestamp - %s' % str(end_timestamp))

        full_duration = end_timestamp - start_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('full_duration - %s' % str(full_duration))

        self.check_if_parent_is_alive()

        run_algorithms_start_timestamp = int(time())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms_start_timestamp - %s' %
                        str(run_algorithms_start_timestamp))

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' %
                        (metric, str(end_timestamp), str(full_duration),
                         anomaly_json, skyline_app, str(algorithms)))
        # defaults in case run_algorithms raises
        anomalous = None
        ensemble = []
        try:
            anomalous, ensemble = run_algorithms(timeseries, str(metric),
                                                 end_timestamp, full_duration,
                                                 str(anomaly_json),
                                                 skyline_app, algorithms)
        except:
            logger.error('error :: run_algorithms failed - %s' %
                         traceback.format_exc())

        run_algorithms_end_timestamp = int(time())
        run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous, ensemble - %s, %s' %
                        (anomalous, str(ensemble)))

        if anomalous:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomalous - %s' % (anomalous))
            nothing_to_do = 'true - for debug only'

        logger.info('run_algorithms took %s seconds' %
                    str(run_algorithms_seconds))

        # Update anomaly file
        crucible_data = 'crucible_tests_run = "%s"\n' \
                        'crucible_triggered_algorithms = %s\n' \
                        'tested_by = "%s"\n' \
            % (str(run_timestamp), str(ensemble), str(this_host))
        crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
        with open(crucible_anomaly_file, 'a') as fh:
            fh.write(crucible_data)
        if python_version == 2:
            mode_arg = int('0644', 8)
        if python_version == 3:
            mode_arg = 0o644
        os.chmod(crucible_anomaly_file, mode_arg)
        logger.info('updated crucible anomaly file - %s/%s.txt' %
                    (anomaly_dir, metric))

        # gzip the json timeseries data after analysis
        if os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                try:
                    # binary mode so the gzip writelines works on Python 3
                    f_in = open(anomaly_json, 'rb')
                    f_out = gzip.open(anomaly_json_gz, 'wb')
                    f_out.writelines(f_in)
                    f_out.close()
                    f_in.close()
                    os.remove(anomaly_json)
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    logger.info('gzipped - %s' % (anomaly_json_gz))
                except:
                    logger.error('error :: Failed to gzip data file - %s' %
                                 traceback.format_exc())
            else:
                os.remove(anomaly_json)

        if run_script:
            if os.path.isfile(run_script):
                logger.info('running - %s' % (run_script))
                os.system('%s %s' %
                          (str(run_script), str(crucible_anomaly_file)))

        # Remove metric check file
        nothing_to_do = ''

        try:
            os.remove(metric_check_file)
            logger.info('complete - removed check file - %s %s' %
                        (metric_check_file, nothing_to_do))
        except OSError:
            pass
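
The check file that spin_process() parses above is a plain Python variables file. The sketch below shows that round trip end to end with a hypothetical loader - the path, metric name and load_metric_vars_sketch() helper are illustrative assumptions, not Skyline's actual load_metric_vars() implementation:

# A minimal, runnable sketch of the check file round trip, assuming the
# check file is a plain Python variables file and that the loader exposes
# its names as attributes the way load_metric_vars() does above. The
# loader, path and metric name here are illustrative assumptions.
import time
from types import SimpleNamespace

metric_timestamp = int(time.time())
check_file = '/tmp/%s.stats.web01.cpu.user.txt' % metric_timestamp

with open(check_file, 'w') as fh:
    fh.write("metric = 'stats.web01.cpu.user'\n")
    fh.write("value = '4.0'\n")
    fh.write("from_timestamp = '%s'\n" % (metric_timestamp - 86400))
    fh.write("metric_timestamp = '%s'\n" % metric_timestamp)


def load_metric_vars_sketch(metric_check_file):
    # execute the variables file and expose its names as attributes,
    # mirroring the metric_vars.metric style access used above
    loaded_vars = {}
    with open(metric_check_file) as fh:
        exec(fh.read(), {}, loaded_vars)
    return SimpleNamespace(**loaded_vars)


metric_vars = load_metric_vars_sketch(check_file)

# the same "pythonic" presence test spin_process() relies on - attribute
# access raises AttributeError when a variable is absent from the file
try:
    metric_vars.algorithms
except AttributeError:
    algorithms = ['all']
    print('algorithms not in check file - defaulting to %s' % str(algorithms))

try:
    metric_vars.metric
except AttributeError:
    print('failed to read metric variable - spin_process() would fail_check()')
else:
    print('metric - %s' % metric_vars.metric)

Running the sketch prints the defaulted algorithms list and the metric name, matching the fallback behaviour both examples implement with their try/except blocks.
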
Example #2
    def spin_process(self, i, run_timestamp, metric_check_file):
        """
        Assign a metric for a process to analyze.

        :param i: python process id
        :param run_timestamp: the epoch timestamp at which this process was called
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('child_process_pid - %s' % str(child_process_pid))

        self.process_list.append(child_process_pid)

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_timestamp - %s' % str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_txt - %s' % check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('check_file_metricname_dir - %s' % check_file_metricname_dir)

        metric_failed_check_dir = failed_checks_dir + '/' + check_file_metricname_dir + '/' + check_file_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('metric_failed_check_dir - %s' % metric_failed_check_dir)

        # failed_check_file = failed_checks_dir + '/' + check_file_name
        failed_check_file = metric_failed_check_dir + '/' + check_file_name

        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.error('error :: failed to import metric variables from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            # TBD - a failed check Panorama update will go here, perhaps txt
            #       files are not the only "queue" that will be used, both, but
            #       Panorama, may be just a part of Skyline Flux, the flux DB
            #       would allow for a very nice, distributed "queue" and a
            #       distributed Skyline workforce...
            #       Any Skyline node could just have one role, e.g. lots of
            #       Skyline nodes running crucible only and instead of reading
            #       the local filesystem for input, they could read the Flux DB
            #       queue or both...
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined.
        # This ensures that if any of the variables are not set for some
        # reason we can handle unexpected data or situations gracefully and
        # try to ensure that the process does not hang.

        # if len(str(metric_vars.metric)) == 0:
        # if not metric_vars.metric:
        try:
            metric_vars.metric
        except:
            logger.error('error :: failed to read metric variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            metric = str(metric_vars.metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric - %s' % metric)

        # if len(metric_vars.value) == 0:
        # if not metric_vars.value:
        try:
            metric_vars.value
        except:
            logger.error('error :: failed to read value variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            value = str(metric_vars.value)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - value - %s' % (value))

        # if len(metric_vars.from_timestamp) == 0:
        # if not metric_vars.from_timestamp:
        try:
            metric_vars.from_timestamp
        except:
            logger.error('error :: failed to read from_timestamp variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            from_timestamp = str(metric_vars.from_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - from_timestamp - %s' % from_timestamp)

        # if len(metric_vars.metric_timestamp) == 0:
        # if not metric_vars.metric_timestamp:
        try:
            metric_vars.metric_timestamp
        except:
            logger.error('error :: failed to read metric_timestamp variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            metric_timestamp = str(metric_vars.metric_timestamp)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - metric_timestamp - %s' % metric_timestamp)

        # if len(metric_vars.algorithms) == 0:
        # if not metric_vars.algorithms:
        try:
            metric_vars.algorithms
        except:
            logger.error('error :: failed to read algorithms variable from check file - %s - setting to all' % (metric_check_file))
            algorithms = ['all']
        else:
            algorithms = []
            for i_algorithm in metric_vars.algorithms:
                algorithms.append(i_algorithm)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - algorithms - %s' % algorithms)

        # if len(metric_vars.anomaly_dir) == 0:
        # if not metric_vars.anomaly_dir:
        try:
            metric_vars.anomaly_dir
        except:
            logger.error('error :: failed to read anomaly_dir variable from check file -  %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            anomaly_dir = str(metric_vars.anomaly_dir)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - anomaly_dir - %s' % anomaly_dir)

        # if len(str(metric_vars.graphite_metric)) == 0:
        try:
            metric_vars.graphite_metric
        except:
            logger.info('failed to read graphite_metric variable from check file - setting to False')
            # yes this is a string
            graphite_metric = 'False'
        else:
            graphite_metric = str(metric_vars.graphite_metric)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - graphite_metric - %s' % graphite_metric)

        # if len(str(metric_vars.run_crucible_tests)) == 0:
        try:
            metric_vars.run_crucible_tests
        except:
            logger.info('failed to read run_crucible_tests variable from check file - setting to False')
            # yes this is a string
            run_crucible_tests = 'False'
        else:
            run_crucible_tests = str(metric_vars.run_crucible_tests)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_crucible_tests - %s' % run_crucible_tests)

        try:
            metric_vars.added_by
        except:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('failed to read added_by variable from check file - setting to crucible')
            added_by = 'crucible'
        else:
            added_by = str(metric_vars.added_by)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - added_by - %s' % added_by)

        try:
            metric_vars.run_script
        except:
            run_script = False
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - not present set to False')
        else:
            run_script = str(metric_vars.run_script)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('metric variable - run_script - %s' % run_script)

        # Only check if the metric does not have an EXPIRATION_TIME key set.
        # Crucible uses the alert EXPIRATION_TIME for the relevant alert
        # setting contexts, whether that be analyzer, mirage, boundary, etc,
        # and sets its own cache_keys in redis.  This prevents large amounts
        # of data being added in terms of timeseries json and image files, as
        # crucible samples at the same EXPIRATION_TIME as alerts.

        source_app = 'crucible'
        expiration_timeout = 1800
        remove_all_anomaly_files = False
        check_expired = False
        check_time = time()

        if added_by == 'analyzer' or added_by == 'mirage':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('Will check %s ALERTS' % added_by)
            if settings.ENABLE_ALERTS:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking %s ALERTS' % added_by)
                for alert in settings.ALERTS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = added_by
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

        if added_by == 'boundary':
            if settings.BOUNDARY_ENABLE_ALERTS:
                for alert in settings.BOUNDARY_METRICS:
                    ALERT_MATCH_PATTERN = alert[0]
                    METRIC_PATTERN = metric
                    alert_match_pattern = re.compile(ALERT_MATCH_PATTERN)
                    pattern_match = alert_match_pattern.match(METRIC_PATTERN)
                    if pattern_match:
                        source_app = 'boundary'
                        expiration_timeout = alert[2]
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('matched - %s - %s - EXPIRATION_TIME is %s' % (source_app, metric, str(expiration_timeout)))
                        check_age = int(check_time) - int(metric_timestamp)
                        if int(check_age) > int(expiration_timeout):
                            check_expired = True
                            if settings.ENABLE_CRUCIBLE_DEBUG:
                                logger.info('the check is older than EXPIRATION_TIME for the metric - not checking - check_expired')

        cache_key = 'crucible.last_check.%s.%s' % (source_app, metric)
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('cache_key - crucible.last_check.%s.%s' % (source_app, metric))

        # Only use the cache_key EXPIRATION_TIME if this is not a request to
        # run_crucible_tests on a timeseries
        if run_crucible_tests == 'False':
            if check_expired:
                logger.info('check_expired - not checking Redis key')
                last_check = True
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('Checking if cache_key exists')
                try:
                    last_check = self.redis_conn.get(cache_key)
                except Exception as e:
                    logger.error('error :: could not query cache_key for %s - %s - %s' % (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
                    # ensure last_check is defined if the Redis query failed
                    last_check = None

            if not last_check:
                try:
                    self.redis_conn.setex(cache_key, expiration_timeout, packb(value))
                    logger.info('set cache_key for %s - %s with timeout of %s' % (source_app, metric, str(expiration_timeout)))
                except Exception as e:
                    logger.error('error :: could not set cache_key for %s - %s - %s' % (source_app, metric, e))
                    logger.info('all anomaly files will be removed')
                    remove_all_anomaly_files = True
            else:
                if check_expired:
                    logger.info('check_expired - all anomaly files will be removed')
                    remove_all_anomaly_files = True
                else:
                    logger.info('cache_key is set and not expired for %s - %s - all anomaly files will be removed' % (source_app, metric))
                    remove_all_anomaly_files = True

        # anomaly dir
        if not os.path.exists(str(anomaly_dir)):
            try:
                mkdir_p(skyline_app, str(anomaly_dir))
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('created anomaly dir - %s' % str(anomaly_dir))
            except:
                logger.error('error :: failed to create anomaly_dir - %s' % str(anomaly_dir))

        if not os.path.exists(str(anomaly_dir)):
            logger.error('error :: anomaly_dir does not exist')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly dir exists - %s' % str(anomaly_dir))

        failed_check_file = anomaly_dir + '/' + metric_timestamp + '.failed.check.' + metric + '.txt'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('failed_check_file - %s' % str(failed_check_file))

        # Retrieve data from graphite if necessary
        anomaly_graph = anomaly_dir + '/' + metric + '.png'
        anomaly_json = anomaly_dir + '/' + metric + '.json'
        anomaly_json_gz = anomaly_json + '.gz'
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomaly_graph - %s' % str(anomaly_graph))
            logger.info('anomaly_json - %s' % str(anomaly_json))
            logger.info('anomaly_json_gz - %s' % str(anomaly_json_gz))

        # Some things added to crucible may not be added by a skyline app per se
        # and if run_crucible_tests is string True then no anomaly files should
        # be removed.
        if run_crucible_tests == 'True':
            remove_all_anomaly_files = False

        # Remove check and anomaly files if the metric has a EXPIRATION_TIME
        # cache_key set
        if remove_all_anomaly_files:
            if os.path.isfile(anomaly_graph):
                try:
                    os.remove(anomaly_graph)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_graph removed - %s' % str(anomaly_graph))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json):
                try:
                    os.remove(anomaly_json)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json removed - %s' % str(anomaly_json))
                except OSError:
                    pass
            if os.path.isfile(anomaly_json_gz):
                try:
                    os.remove(anomaly_json_gz)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_json_gz removed - %s' % str(anomaly_json_gz))
                except OSError:
                    pass

            anomaly_txt_file = anomaly_dir + '/' + metric + '.txt'
            if os.path.isfile(anomaly_txt_file):
                try:
                    os.remove(anomaly_txt_file)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_txt_file removed - %s' % str(anomaly_txt_file))
                except OSError:
                    pass

            # TBD - this data would have to be added to the Panorama DB before
            #       it is removed
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('metric_check_file removed - %s' % str(metric_check_file))
                except OSError:
                    pass

            if os.path.exists(str(anomaly_dir)):
                try:
                    os.rmdir(str(anomaly_dir))
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('anomaly_dir removed - %s' % str(anomaly_dir))
                except OSError:
                    pass

            logger.info('check and anomaly files removed')
            return

        # Check if the image exists
        if graphite_metric == 'True':

            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite_metric - %s' % (graphite_metric))

            # Graphite timeouts
            connect_timeout = int(settings.GRAPHITE_CONNECT_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('connect_timeout - %s' % str(connect_timeout))

            read_timeout = int(settings.GRAPHITE_READ_TIMEOUT)
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('read_timeout - %s' % str(read_timeout))

            graphite_until = datetime.datetime.fromtimestamp(int(metric_timestamp)).strftime('%H:%M_%Y%m%d')
            graphite_from = datetime.datetime.fromtimestamp(int(from_timestamp)).strftime('%H:%M_%Y%m%d')

            # graphite URL
            if settings.GRAPHITE_PORT != '':
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + ':' + settings.GRAPHITE_PORT + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            else:
                url = settings.GRAPHITE_PROTOCOL + '://' + settings.GRAPHITE_HOST + '/render/?from=' + graphite_from + '&until=' + graphite_until + '&target=' + metric + '&format=json'
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('graphite url - %s' % (url))

            if not os.path.isfile(anomaly_graph):
                logger.info('retrieving png - surfacing %s graph from graphite from %s to %s' % (metric, graphite_from, graphite_until))

                image_url = url.replace('&format=json', '')
                graphite_image_file = anomaly_dir + '/' + metric + '.png'
                if 'width' not in image_url:
                    image_url += '&width=586'
                if 'height' not in image_url:
                    image_url += '&height=308'
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('graphite image url - %s' % (image_url))
                image_url_timeout = int(connect_timeout)

                image_data = None

                try:
                    image_data = urllib2.urlopen(image_url, timeout=image_url_timeout).read()
                    logger.info('url OK - %s' % (image_url))
                except urllib2.URLError:
                    image_data = None
                    logger.error('error :: url bad - %s' % (image_url))

                if image_data is not None:
                    # image_data is bytes, so write in binary mode
                    with open(graphite_image_file, 'wb') as f:
                        f.write(image_data)
                    logger.info('retrieved - %s' % (anomaly_graph))
                    if python_version == 2:
                        mode_arg = int('0644', 8)
                    if python_version == 3:
                        mode_arg = 0o644
                    os.chmod(graphite_image_file, mode_arg)
                else:
                    logger.error('error :: failed to retrieve - %s' % (anomaly_graph))
            else:
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_graph file exists - %s' % str(anomaly_graph))

            if not os.path.isfile(anomaly_graph):
                logger.error('error :: retrieve failed to surface %s graph from graphite' % (metric))
            else:
                logger.info('graph image exists - %s' % (anomaly_graph))

            # Check if the json exists
            if not os.path.isfile(anomaly_json_gz):
                if not os.path.isfile(anomaly_json):
                    logger.info('surfacing timeseries data for %s from graphite from %s to %s' % (metric, graphite_from, graphite_until))
                    # compare versions numerically - a plain string compare
                    # would misorder e.g. '2.10.0' and '2.4.0'
                    from distutils.version import LooseVersion
                    if LooseVersion(requests.__version__) >= LooseVersion('2.4.0'):
                        use_timeout = (int(connect_timeout), int(read_timeout))
                    else:
                        use_timeout = int(connect_timeout)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('use_timeout - %s' % (str(use_timeout)))

                    try:
                        r = requests.get(url, timeout=use_timeout)
                        js = r.json()
                        datapoints = js[0]['datapoints']
                        if settings.ENABLE_CRUCIBLE_DEBUG:
                            logger.info('data retrieved OK')
                    except:
                        # graphite_until is a formatted time string, so fall
                        # back to the raw epoch metric_timestamp for the
                        # placeholder datapoint
                        datapoints = [[None, int(metric_timestamp)]]
                        logger.error('error :: data retrieval failed')

                    converted = []
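                    # Graphite returns each datapoint as [value, timestamp];
                    # flip to [timestamp, value] pairs and skip datapoints
                    # with null values, which float() rejects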
                    for datapoint in datapoints:
                        try:
                            new_datapoint = [float(datapoint[1]), float(datapoint[0])]
                            converted.append(new_datapoint)
                        except:
                            continue

                    with open(anomaly_json, 'w') as f:
                        f.write(json.dumps(converted))
                    # int mode - 0o644 is valid on Python 2.6+ and 3
                    mode_arg = 0o644
                    os.chmod(anomaly_json, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('json file - %s' % anomaly_json)

                if not os.path.isfile(anomaly_json):
                    logger.error('error :: failed to surface %s json from graphite' % (metric))
                    # Move metric check file
                    try:
                        shutil.move(metric_check_file, failed_check_file)
                        logger.info('moved check file to - %s' % failed_check_file)
                    except OSError:
                        logger.error('error :: failed to move check file to - %s' % failed_check_file)
                        pass
                    return

        # Check timeseries json exists - raw or gz
        if not os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                logger.error('error :: no json data found for %s' % (metric))
                # Move metric check file
                try:
                    shutil.move(metric_check_file, failed_check_file)
                    logger.info('moved check file to - %s' % failed_check_file)
                except OSError:
                    logger.error('error :: failed to move check file to - %s' % failed_check_file)
                    pass
                return
            else:
                logger.info('timeseries json gzip exists - %s' % (anomaly_json_gz))
        else:
            logger.info('timeseries json exists - %s' % (anomaly_json))

        # If the timeseries json exists and run_crucible_tests is str(False),
        # gzip the data and return here as there is nothing further to do
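        # Note: the metric check variables are read from a text file, so
        # booleans arrive as the strings 'True'/'False', hence the string
        # comparisons on run_crucible_tests and graphite_metric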
        if run_crucible_tests == 'False':
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('run_crucible_tests - %s' % run_crucible_tests)
            # gzip the json timeseries data
            if os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzipping - %s' % anomaly_json)
                try:
                    # read bytes and stream them into the gzip file
                    with open(anomaly_json, 'rb') as f_in:
                        with gzip.open(anomaly_json_gz, 'wb') as f_out:
                            f_out.writelines(f_in)
                    os.remove(anomaly_json)
                    mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    if settings.ENABLE_CRUCIBLE_DEBUG:
                        logger.info('gzipped - %s' % anomaly_json_gz)
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' % metric_check_file)
                    except OSError:
                        pass
                    return
                except:
                    logger.error('error :: Failed to gzip data file - %s' % traceback.format_exc())
                    try:
                        os.remove(metric_check_file)
                        logger.info('removed check file - %s' % metric_check_file)
                    except OSError:
                        pass
                    return

            if os.path.isfile(anomaly_json_gz):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('gzip exists - %s' % anomaly_json_gz)
                # remove the check file and return regardless of the
                # debug setting
                try:
                    os.remove(metric_check_file)
                    logger.info('removed check file - %s' % metric_check_file)
                except OSError:
                    pass
                return
            nothing_to_do = 'true - for debug only'

        # self.check_if_parent_is_alive()
        # Run crucible algorithms
        logger.info('running crucible tests - %s' % (metric))

        timeseries_dir = metric.replace('.', '/')
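        # e.g. a metric named 'stats.web01.cpu' (name for illustration)
        # maps to the directory path 'stats/web01/cpu'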

        if os.path.isfile(anomaly_json_gz):
            if not os.path.isfile(anomaly_json):
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipping - %s' % anomaly_json_gz)
                raw_timeseries = None
                try:
                    with gzip.open(anomaly_json_gz, 'rb') as fr:
                        raw_timeseries = fr.read()
                except Exception as e:
                    logger.error('error :: could not ungzip %s - %s' % (anomaly_json_gz, e))
                    logger.info(traceback.format_exc())
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('ungzipped')
                    logger.info('writing to - %s' % anomaly_json)
                if raw_timeseries is not None:
                    # gzip.open in 'rb' mode yields bytes, so write the
                    # json back out in binary mode
                    with open(anomaly_json, 'wb') as fw:
                        fw.write(raw_timeseries)
                if settings.ENABLE_CRUCIBLE_DEBUG:
                    logger.info('anomaly_json done')
                mode_arg = 0o644
                os.chmod(anomaly_json, mode_arg)
        else:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('No gzip - %s' % anomaly_json_gz)
            nothing_to_do = 'true - for debug only'

        if os.path.isfile(anomaly_json):
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomaly_json exists - %s' % anomaly_json)
                logger.info('loading timeseries from - %s' % anomaly_json)
            with open(anomaly_json, 'r') as f:
                timeseries = json.loads(f.read())
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('loaded timeseries from - %s' % anomaly_json)
        else:
            try:
                logger.error('error :: file not found - %s' % anomaly_json)
                shutil.move(metric_check_file, failed_check_file)
                mode_arg = 0o644
                os.chmod(failed_check_file, mode_arg)
                logger.info('moved check file to - %s' % failed_check_file)
            except OSError:
                logger.error('error :: failed to move check file to - %s' % failed_check_file)
                pass
            return

        start_timestamp = int(timeseries[0][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('start_timestamp - %s' % str(start_timestamp))
        end_timestamp = int(timeseries[-1][0])
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('end_timestamp - %s' % str(end_timestamp))

        full_duration = end_timestamp - start_timestamp
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('full_duration - %s' % str(full_duration))

        self.check_if_parent_is_alive()

        run_algorithms_start_timestamp = int(time())
        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms_start_timestamp - %s' % str(run_algorithms_start_timestamp))

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('run_algorithms - %s,%s,%s,%s,%s,%s' % (metric, str(end_timestamp), str(full_duration), anomaly_json, skyline_app, str(algorithms)))
        anomalous = None
        ensemble = []
        try:
            anomalous, ensemble = run_algorithms(timeseries, str(metric), end_timestamp, full_duration, str(anomaly_json), skyline_app, algorithms)
        except:
            logger.error('error :: run_algorithms failed - %s' % traceback.format_exc())

        run_algorithms_end_timestamp = int(time())
        run_algorithms_seconds = run_algorithms_end_timestamp - run_algorithms_start_timestamp

        if settings.ENABLE_CRUCIBLE_DEBUG:
            logger.info('anomalous, ensemble - %s, %s' % (anomalous, str(ensemble)))

        if anomalous:
            if settings.ENABLE_CRUCIBLE_DEBUG:
                logger.info('anomalous - %s' % (anomalous))
            nothing_to_do = 'true - for debug only'

        logger.info('run_algorithms took %s seconds' % str(run_algorithms_seconds))

        # Update anomaly file
        crucible_data = 'crucible_tests_run = "%s"\n' \
                        'crucible_triggered_algorithms = %s\n' \
                        'tested_by = "%s"\n' \
            % (str(run_timestamp), str(ensemble), str(this_host))
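        # appends lines of the form (values illustrative):
        #   crucible_tests_run = "1480000000"
        #   crucible_triggered_algorithms = [...]
        #   tested_by = "skyline-host-1"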
        crucible_anomaly_file = '%s/%s.txt' % (anomaly_dir, metric)
        with open(crucible_anomaly_file, 'a') as fh:
            fh.write(crucible_data)
        mode_arg = 0o644
        os.chmod(crucible_anomaly_file, mode_arg)
        logger.info('updated crucible anomaly file - %s/%s.txt' % (anomaly_dir, metric))

        # gzip the json timeseries data after analysis
        if os.path.isfile(anomaly_json):
            if not os.path.isfile(anomaly_json_gz):
                try:
                    with open(anomaly_json, 'rb') as f_in:
                        with gzip.open(anomaly_json_gz, 'wb') as f_out:
                            f_out.writelines(f_in)
                    os.remove(anomaly_json)
                    mode_arg = 0o644
                    os.chmod(anomaly_json_gz, mode_arg)
                    logger.info('gzipped - %s' % (anomaly_json_gz))
                except:
                    logger.error('error :: Failed to gzip data file - %s' % traceback.format_exc())
            else:
                os.remove(anomaly_json)

        if run_script:
            if os.path.isfile(run_script):
                logger.info('running - %s' % (run_script))
                os.system('%s %s' % (str(run_script), str(crucible_anomaly_file)))

        # Remove metric check file
        try:
            os.remove(metric_check_file)
            logger.info('complete - removed check file - %s' % (metric_check_file))
        except OSError:
            pass
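The gzip-then-remove pattern above recurs throughout these examples. A minimal standalone sketch of the same round trip, using only the standard library (the helper name and paths are illustrative, not part of the original code):

    import gzip
    import os
    import shutil

    def gzip_and_remove(src_path, gz_path, mode=0o644):
        """Gzip src_path to gz_path, set its mode and remove the original."""
        with open(src_path, 'rb') as f_in:
            with gzip.open(gz_path, 'wb') as f_out:
                # stream in chunks rather than reading the whole file
                shutil.copyfileobj(f_in, f_out)
        os.chmod(gz_path, mode)
        os.remove(src_path)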
Example #3
    def run(self):
        """
        Called when the process initializes.

        Determine what is known in the Skyline DB and populate
        anything that is missing.

        """

        # Log management to prevent overwriting
        # Allow the bin/<skyline_app>.d to manage the log
        if os.path.isfile(skyline_app_logwait):
            try:
                logger.info('removing %s' % skyline_app_logwait)
                os.remove(skyline_app_logwait)
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_logwait)
                pass

        now = time()
        log_wait_for = now + 5
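        # wait up to 5 seconds for bin/<skyline_app>.d to release the
        # log lock before proceeding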
        while now < log_wait_for:
            if os.path.isfile(skyline_app_loglock):
                sleep(.1)
                now = time()
            else:
                now = log_wait_for + 1

        logger.info('starting %s run' % skyline_app)
        if os.path.isfile(skyline_app_loglock):
            logger.error('error :: bin/%s.d log management seems to have failed, continuing' % skyline_app)
            try:
                os.remove(skyline_app_loglock)
                logger.info('log lock file removed')
            except OSError:
                logger.error('error :: failed to remove %s, continuing' % skyline_app_loglock)
                pass
        else:
            logger.info('bin/%s.d log management done' % skyline_app)

        # See if I am known in the DB, if so, what are my variables
        # self.populate mysql
        # What is my host id in the Skyline panorama DB?
        #   - if not known - INSERT hostname INTO hosts
        # What are the known apps?
        #   - if returned make a dictionary
        # What are the known algorithms?
        #   - if returned make a dictionary

        while 1:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
                if ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: connected to Redis')
            except:
                logger.error('error :: cannot connect to redis at socket path %s' % (
                    settings.REDIS_SOCKET_PATH))
                sleep(30)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Report app up
            try:
                self.redis_conn.setex(skyline_app, 120, now)
                logger.info('updated Redis key for %s up' % skyline_app)
            except:
                logger.error('error :: failed to update Redis key for %s up' % skyline_app)

            if ENABLE_PANORAMA_DEBUG:
                # Make sure mysql is available
                mysql_down = True
                while mysql_down:

                    query = 'SHOW TABLES'
                    results = self.mysql_select(query)

                    if results:
                        mysql_down = False
                        logger.info('debug :: tested database query - OK')
                    else:
                        logger.error('error :: failed to query database')
                        sleep(30)

            if ENABLE_PANORAMA_DEBUG:
                try:
                    query = 'SELECT id, test FROM test'
                    result = self.mysql_select(query)
                    logger.info('debug :: tested mysql SELECT query - OK')
                    logger.info('debug :: result: %s' % str(result))
                    logger.info('debug :: result[0]: %s' % str(result[0]))
                    logger.info('debug :: result[1]: %s' % str(result[1]))
# Works
# 2016-06-10 19:07:23 :: 4707 :: result: [(1, u'test1')]
                except:
                    logger.error(
                        'error :: mysql error - %s' %
                        traceback.format_exc())
                    logger.error('error :: failed to SELECT')

            # self.populate the database metadata tables
            # What is my host id in the Skyline panorama DB?
            host_id = False
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query = 'select id FROM hosts WHERE host=\'%s\'' % this_host  # nosec
            results = self.mysql_select(query)
            if results:
                host_id = results[0][0]
                logger.info('host_id: %s' % str(host_id))
            else:
                logger.info('failed to determine host id of %s' % this_host)

            #   - if not known - INSERT hostname INTO host
            if not host_id:
                logger.info('inserting %s into hosts table' % this_host)
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                query = 'insert into hosts (host) VALUES (\'%s\')' % this_host  # nosec
                host_id = self.mysql_insert(query)
                if host_id:
                    logger.info('new host_id: %s' % str(host_id))

            if not host_id:
                logger.error(
                    'error :: failed to populate %s into the hosts table' %
                    this_host)
                sleep(30)
                continue

            # Loop through the panorama check dir and see if any work has
            # been left to do, etc
            # Make sure check_dir exists and has not been removed
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: checking check dir exists - %s' % settings.PANORAMA_CHECK_PATH)
            # os.path.exists returns False rather than raising for a
            # missing path, so test the return value explicitly
            if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                logger.error('error :: check dir did not exist - %s' % settings.PANORAMA_CHECK_PATH)
                mkdir_p(settings.PANORAMA_CHECK_PATH)
                logger.info('check dir created - %s' % settings.PANORAMA_CHECK_PATH)
                # continue

            """
            Determine if any metric check files have been added
            """
            while True:
                # Discover metric anomalies to insert
                metric_var_files = False
                try:
                    metric_var_files = [f for f in listdir(settings.PANORAMA_CHECK_PATH) if isfile(join(settings.PANORAMA_CHECK_PATH, f))]
                except:
                    logger.error('error :: failed to list files in check dir')
                    logger.info(traceback.format_exc())

                if metric_var_files:
                    break

                logger.info('sleeping 20 no metric check files')
                sleep(20)

            metric_var_files_sorted = sorted(metric_var_files)
            metric_check_file = '%s/%s' % (settings.PANORAMA_CHECK_PATH, str(metric_var_files_sorted[0]))

            logger.info('assigning anomaly for insertion - %s' % str(metric_var_files_sorted[0]))

            # Spawn processes
            pids = []
            spawned_pids = []
            pid_count = 0
            now = time()
            for i in range(1, settings.PANORAMA_PROCESSES + 1):
                try:
                    p = Process(target=self.spin_process, args=(i, metric_check_file))
                    pids.append(p)
                    pid_count += 1
                    logger.info('starting %s of %s spin_process/es' % (str(pid_count), str(settings.PANORAMA_PROCESSES)))
                    p.start()
                    spawned_pids.append(p.pid)
                except:
                    logger.error('error :: failed to start spin_process')
                    logger.info(traceback.format_exc())
                    continue

            # Send wait signal to zombie processes
            # for p in pids:
            #     p.join()
            # Self monitor processes and terminate if any spin_process has run
            # for longer than the 20 second deadline below
            p_starts = time()
            while time() - p_starts <= 20:
                if any(p.is_alive() for p in pids):
                    # Just to avoid hogging the CPU
                    sleep(.1)
                else:
                    # All the processes are done, break now.
                    time_to_run = time() - p_starts
                    logger.info(
                        '%s :: %s spin_process/es completed in %.2f seconds' % (
                            skyline_app, str(settings.PANORAMA_PROCESSES),
                            time_to_run))
                    break
            else:
                # We only enter this if we didn't 'break' above.
                logger.info('%s :: timed out, killing all spin_process processes' % (skyline_app))
                for p in pids:
                    p.terminate()
                    # p.join()

                check_file_name = os.path.basename(str(metric_check_file))
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_name - %s' % check_file_name)
                check_file_timestamp = check_file_name.split('.', 1)[0]
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp))
                check_file_metricname_txt = check_file_name.split('.', 1)[1]
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt)
                check_file_metricname = check_file_metricname_txt.replace('.txt', '')
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname - %s' % check_file_metricname)
                check_file_metricname_dir = check_file_metricname.replace('.', '/')
                if settings.ENABLE_PANORAMA_DEBUG:
                    logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir)

                metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp)

                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))

            for p in pids:
                if p.is_alive():
                    logger.info('%s :: stopping spin_process - %s' % (skyline_app, str(p.is_alive())))
                    p.join()
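The spawn-and-watch loop above is a general multiprocessing pattern: start the workers, poll is_alive() against a deadline, and terminate any stragglers. A minimal sketch of the same pattern, with the worker body, the process count and the 20 second deadline all illustrative:

    from multiprocessing import Process
    from time import sleep, time

    def worker(i):
        sleep(i)  # placeholder workload

    if __name__ == '__main__':
        procs = [Process(target=worker, args=(i,)) for i in range(1, 3)]
        for p in procs:
            p.start()

        p_starts = time()
        while time() - p_starts <= 20:
            if any(p.is_alive() for p in procs):
                sleep(0.1)
            else:
                break
        else:
            # only reached if the deadline expired without a break
            for p in procs:
                p.terminate()

        for p in procs:
            if p.is_alive():
                p.join()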
Example #4
    def spin_process(self, i, metric_check_file):
        """
        Assign a metric anomaly to process.

        :param i: python process id
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: child_process_pid - %s' % str(child_process_pid))

        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir)

        metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp)

        failed_check_file = '%s/%s' % (metric_failed_check_dir, check_file_name)
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: failed_check_file - %s' % failed_check_file)

        # Load and validate metric variables
        try:
            # @modified 20170101 - Feature #1830: Ionosphere alerts
            #                      Bug #1460: panorama check file fails
            #                      Panorama check file fails #24
            # Get rid of the skyline_functions imp as imp is deprecated in py3 anyway
            # Use def new_load_metric_vars(self, metric_vars_file):
            # metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
            metric_vars_array = self.new_load_metric_vars(str(metric_check_file))
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: failed to load metric variables from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined,
        # this ensures that if any of the variables are not set for some reason
        # we can handle unexpected data or situations gracefully and try and
        # ensure that the process does not hang.
        metric = None
        try:
            # metric_vars.metric
            # metric = str(metric_vars.metric)
            key = 'metric'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            metric = str(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - metric - %s' % metric)
        except:
            logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not metric:
            logger.error('error :: failed to load metric variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        value = None
        # @added 20171214 - Bug #2234: panorama metric_vars value check
        value_valid = None
        try:
            # metric_vars.value
            # value = str(metric_vars.value)
            key = 'value'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            value = float(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - value - %s' % (value))
            # @added 20171214 - Bug #2234: panorama metric_vars value check
            value_valid = True
        except:
            logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        # @added 20171214 - Bug #2234: panorama metric_vars value check
        # If value was a float of 0.0 then this was interpreted as not set
        # if not value:
        if not value_valid:
            # @added 20171214 - Bug #2234: panorama metric_vars value check
            # Added exception handling here
            logger.info(traceback.format_exc())
            logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        from_timestamp = None
        try:
            # metric_vars.from_timestamp
            # from_timestamp = str(metric_vars.from_timestamp)
            key = 'from_timestamp'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            from_timestamp = int(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - from_timestamp - %s' % from_timestamp)
        except:
            # @added 20160822 - Bug #1460: panorama check file fails
            # Added exception handling here
            logger.info(traceback.format_exc())
            logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not from_timestamp:
            logger.error('error :: failed to load from_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        metric_timestamp = None
        try:
            # metric_vars.metric_timestamp
            # metric_timestamp = str(metric_vars.metric_timestamp)
            key = 'metric_timestamp'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            metric_timestamp = int(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - metric_timestamp - %s' % metric_timestamp)
        except:
            logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not metric_timestamp:
            logger.error('error :: failed to load metric_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        algorithms = None
        try:
            # metric_vars.algorithms
            # algorithms = metric_vars.algorithms
            key = 'algorithms'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            algorithms = value_list[0]
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - algorithms - %s' % str(algorithms))
        except:
            logger.error('error :: failed to read algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not algorithms:
            logger.error('error :: failed to load algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        triggered_algorithms = None
        try:
            # metric_vars.triggered_algorithms
            # triggered_algorithms = metric_vars.triggered_algorithms
            key = 'triggered_algorithms'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            triggered_algorithms = value_list[0]
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - triggered_algorithms - %s' % str(triggered_algorithms))
        except:
            logger.error('error :: failed to read triggered_algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not triggered_algorithms:
            logger.error('error :: failed to load triggered_algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        app = None
        try:
            # metric_vars.app
            # app = str(metric_vars.app)
            key = 'app'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            app = str(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - app - %s' % app)
        except:
            logger.error('error :: failed to read app variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not app:
            logger.error('error :: failed to load app variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        source = None
        try:
            # metric_vars.source
            # source = str(metric_vars.source)
            key = 'source'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            source = str(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - source - %s' % source)
        except:
            logger.error('error :: failed to read source variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not source:
            logger.error('error :: failed to load source variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        added_by = None
        try:
            # metric_vars.added_by
            # added_by = str(metric_vars.added_by)
            key = 'added_by'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            added_by = str(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - added_by - %s' % added_by)
        except:
            logger.error('error :: failed to read added_by variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not added_by:
            logger.error('error :: failed to load added_by variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        added_at = None
        try:
            # metric_vars.added_at
            # added_at = str(metric_vars.added_at)
            key = 'added_at'
            value_list = [var_array[1] for var_array in metric_vars_array if var_array[0] == key]
            added_at = str(value_list[0])
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - added_at - %s' % added_at)
        except:
            logger.error('error :: failed to read added_at variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        if not added_at:
            logger.error('error :: failed to load added_at variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        record_anomaly = True
        cache_key = '%s.last_check.%s.%s' % (skyline_app, app, metric)
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: cache_key - %s.last_check.%s.%s' % (
                skyline_app, app, metric))
        try:
            last_check = self.redis_conn.get(cache_key)
        except Exception as e:
            logger.error(
                'error :: could not query cache_key - %s.last_check.%s.%s - %s' % (
                    skyline_app, app, metric, e))
            last_check = None

        if last_check:
            record_anomaly = False
            logger.info(
                'Panorama metric key not expired - %s.last_check.%s.%s' % (
                    skyline_app, app, metric))

        # @added 20160907 - Handle Panorama stampede on restart after not running #26
        # Allow to expire check if greater than PANORAMA_CHECK_MAX_AGE
        if max_age:
            now = time()
            anomaly_age = int(now) - int(metric_timestamp)
            if anomaly_age > max_age_seconds:
                record_anomaly = False
                logger.info(
                    'Panorama check max age exceeded - %s - %s seconds old, older than %s seconds discarding' % (
                        metric, str(anomaly_age), str(max_age_seconds)))

        if not record_anomaly:
            logger.info('not recording anomaly for - %s' % (metric))
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    logger.info('metric_check_file removed - %s' % str(metric_check_file))
                except OSError:
                    pass

            return

        # Determine the id of something
        def determine_id(table, key, value):
            """
            Get the id of something from Redis or the database and insert a new
            record if one does not exist for the value.

            :param table: table name
            :param key: key name
            :param value: value name
            :type table: str
            :type key: str
            :type value: str
            :return: int or boolean

            """

            query_cache_key = '%s.mysql_ids.%s.%s.%s' % (skyline_app, table, key, value)
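            # e.g. 'panorama.mysql_ids.apps.app.analyzer' (key shape for
            # illustration)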
            determined_id = None
            redis_determined_id = None
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: query_cache_key - %s' % (query_cache_key))

            try:
                redis_known_id = self.redis_conn.get(query_cache_key)
            except:
                redis_known_id = None

            if redis_known_id:
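                # cached ids are stored msgpack-encoded (packb), so feed
                # the raw bytes through an Unpacker to recover the int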
                unpacker = Unpacker(use_list=False)
                unpacker.feed(redis_known_id)
                redis_determined_id = list(unpacker)

            if redis_determined_id:
                determined_id = int(redis_determined_id[0])

            if determined_id:
                if determined_id > 0:
                    return determined_id

            # Query MySQL
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query = 'select id FROM %s WHERE %s=\'%s\'' % (table, key, value)  # nosec

            # @modified 20170916 - Bug #2166: panorama incorrect mysql_id cache keys
            # Wrap in except
            # results = self.mysql_select(query)
            results = None
            try:
                results = self.mysql_select(query)
            except:
                logger.error('error :: failed to determine results from - %s' % (query))

            determined_id = 0
            if results:
                try:
                    determined_id = int(results[0][0])
                except Exception as e:
                    logger.error(traceback.format_exc())
                    logger.error('error :: determined_id is not an int')
                    determined_id = 0

            if determined_id > 0:
                # Set the key for a week
                if not redis_determined_id:
                    try:
                        self.redis_conn.setex(query_cache_key, 604800, packb(determined_id))
                        logger.info('set redis query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to set query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                return int(determined_id)

            # @added 20170115 - Feature #1854: Ionosphere learn - generations
            # Added determination of the learn related variables
            # learn_full_duration_days, learn_valid_ts_older_than,
            # max_generations and max_percent_diff_from_origin value to the
            # insert statement if the table is the metrics table.
            if table == 'metrics' and key == 'metric':
                # Set defaults
                learn_full_duration_days = int(settings.IONOSPHERE_LEARN_DEFAULT_FULL_DURATION_DAYS)
                valid_learning_duration = int(settings.IONOSPHERE_LEARN_DEFAULT_VALID_TIMESERIES_OLDER_THAN_SECONDS)
                max_generations = int(settings.IONOSPHERE_LEARN_DEFAULT_MAX_GENERATIONS)
                max_percent_diff_from_origin = float(settings.IONOSPHERE_LEARN_DEFAULT_MAX_PERCENT_DIFF_FROM_ORIGIN)
                try:
                    use_full_duration, valid_learning_duration, use_full_duration_days, max_generations, max_percent_diff_from_origin = get_ionosphere_learn_details(skyline_app, value)
                    learn_full_duration_days = use_full_duration_days
                except:
                    logger.error(traceback.format_exc())
                    logger.error('error :: failed to get_ionosphere_learn_details for %s' % value)

                logger.info('metric learn details determined for %s' % value)
                logger.info('learn_full_duration_days     :: %s days' % (str(learn_full_duration_days)))
                logger.info('valid_learning_duration      :: %s seconds' % (str(valid_learning_duration)))
                logger.info('max_generations              :: %s' % (str(max_generations)))
                logger.info('max_percent_diff_from_origin :: %s' % (str(max_percent_diff_from_origin)))

            # INSERT because no known id
            # @modified 20170115 - Feature #1854: Ionosphere learn - generations
            # Added the learn_full_duration_days, learn_valid_ts_older_than,
            # max_generations and max_percent_diff_from_origin value to the
            # insert statement if the table is the metrics table.
            # insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value)
            if table == 'metrics' and key == 'metric':
                # @modified 20170913 - Task #2160: Test skyline with bandit
                # Added nosec to exclude from bandit tests
                insert_query_string = '%s (%s, learn_full_duration_days, learn_valid_ts_older_than, max_generations, max_percent_diff_from_origin) VALUES (\'%s\', %s, %s, %s, %s)' % (
                    table, key, value, str(learn_full_duration_days),
                    str(valid_learning_duration), str(max_generations),
                    str(max_percent_diff_from_origin))
                insert_query = 'insert into %s' % insert_query_string  # nosec
            else:
                insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value)  # nosec

            logger.info('inserting %s into %s table' % (value, table))
            try:
                results = self.mysql_insert(insert_query)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: failed to determine the id of %s from the insert' % (value))
                raise

            determined_id = 0
            if results:
                determined_id = int(results)
            else:
                logger.error('error :: results not set')
                # a bare raise here would have no active exception to
                # re-raise, so raise an explicit error instead
                raise RuntimeError('results not set from mysql_insert')

            if determined_id > 0:
                # Set the key for a week
                if not redis_determined_id:
                    try:
                        self.redis_conn.setex(query_cache_key, 604800, packb(determined_id))
                        logger.info('set redis query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error('%s' % str(e))
                        logger.error('error :: failed to set query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                return determined_id

            logger.error('error :: failed to determine the inserted id for %s' % value)
            return False

        try:
            added_by_host_id = determine_id('hosts', 'host', added_by)
        except:
            logger.error('error :: failed to determine id of %s' % (added_by))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            app_id = determine_id('apps', 'app', app)
        except:
            logger.error('error :: failed to determine id of %s' % (app))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            source_id = determine_id('sources', 'source', source)
        except:
            logger.error('error :: failed to determine id of %s' % (source))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            metric_id = determine_id('metrics', 'metric', metric)
        except:
            logger.error('error :: failed to determine id of %s' % (metric))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        algorithms_ids_csv = ''
        for algorithm in algorithms:
            try:
                algorithm_id = determine_id('algorithms', 'algorithm', algorithm)
            except:
                logger.error('error :: failed to determine id of %s' % (algorithm))
                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
                return False
            if algorithms_ids_csv == '':
                algorithms_ids_csv = str(algorithm_id)
            else:
                new_algorithms_ids_csv = '%s,%s' % (algorithms_ids_csv, str(algorithm_id))
                algorithms_ids_csv = new_algorithms_ids_csv
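        # e.g. algorithms ['histogram_bins', 'ks_test'] might yield an
        # algorithms_ids_csv of '1,2' (ids illustrative)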

        triggered_algorithms_ids_csv = ''
        for triggered_algorithm in triggered_algorithms:
            try:
                triggered_algorithm_id = determine_id('algorithms', 'algorithm', triggered_algorithm)
            except:
                logger.error('error :: failed to determine id of %s' % (triggered_algorithm))
                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
                return False
            if triggered_algorithms_ids_csv == '':
                triggered_algorithms_ids_csv = str(triggered_algorithm_id)
            else:
                new_triggered_algorithms_ids_csv = '%s,%s' % (
                    triggered_algorithms_ids_csv, str(triggered_algorithm_id))
                triggered_algorithms_ids_csv = new_triggered_algorithms_ids_csv

        logger.info('inserting anomaly')
        try:
            full_duration = int(metric_timestamp) - int(from_timestamp)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: full_duration - %s' % str(full_duration))
        except:
            logger.error('error :: failed to determine full_duration')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            anomalous_datapoint = round(float(value), 6)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: anomalous_datapoint - %s' % str(anomalous_datapoint))
        except:
            logger.error('error :: failed to determine anomalous_datapoint')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            columns = '%s, %s, %s, %s, %s, %s, %s, %s, %s' % (
                'metric_id', 'host_id', 'app_id', 'source_id',
                'anomaly_timestamp', 'anomalous_datapoint', 'full_duration',
                'algorithms_run', 'triggered_algorithms')
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: columns - %s' % str(columns))
        except:
            logger.error('error :: failed to construct columns string')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            # @modified 20170913 - Task #2160: Test skyline with bandit
            # Added nosec to exclude from bandit tests
            query_string = '(%s) VALUES (%d, %d, %d, %d, %s, %.6f, %d, \'%s\', \'%s\')' % (
                columns, metric_id, added_by_host_id, app_id, source_id,
                metric_timestamp, anomalous_datapoint, full_duration,
                algorithms_ids_csv, triggered_algorithms_ids_csv)
            query = 'insert into anomalies %s' % query_string  # nosec
        except:
            logger.error('error :: failed to construct insert query')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: anomaly insert - %s' % str(query))

        anomaly_id = None
        try:
            anomaly_id = self.mysql_insert(query)
            logger.info('anomaly id - %d - created for %s at %s' % (
                anomaly_id, metric, metric_timestamp))
        except:
            logger.error('error :: failed to insert anomaly for %s at %s' % (
                metric, metric_timestamp))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        # Set anomaly record cache key
        try:
            self.redis_conn.setex(
                cache_key, settings.PANORAMA_EXPIRY_TIME, packb(value))
            logger.info('set cache_key - %s.last_check.%s.%s - %s' % (
                skyline_app, app, metric, str(settings.PANORAMA_EXPIRY_TIME)))
        except Exception as e:
            logger.error(
                'error :: could not set cache_key - %s.last_check.%s.%s - %s' % (
                    skyline_app, app, metric, e))

        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass

        return anomaly_id
Example #5
    def spin_process(self, i, metric_check_file):
        """
        Assign a metric anomaly to process.

        :param i: python process id
        :param metric_check_file: full path to the metric check file

        :return: returns True

        """

        child_process_pid = os.getpid()
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: child_process_pid - %s' % str(child_process_pid))

        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: processing metric check - %s' % metric_check_file)

        if not os.path.isfile(str(metric_check_file)):
            logger.error('error :: file not found - metric_check_file - %s' % (str(metric_check_file)))
            return

        check_file_name = os.path.basename(str(metric_check_file))
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_name - %s' % check_file_name)
        check_file_timestamp = check_file_name.split('.', 1)[0]
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_timestamp - %s' % str(check_file_timestamp))
        check_file_metricname_txt = check_file_name.split('.', 1)[1]
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname_txt - %s' % check_file_metricname_txt)
        check_file_metricname = check_file_metricname_txt.replace('.txt', '')
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname - %s' % check_file_metricname)
        check_file_metricname_dir = check_file_metricname.replace('.', '/')
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: check_file_metricname_dir - %s' % check_file_metricname_dir)

        metric_failed_check_dir = '%s/%s/%s' % (failed_checks_dir, check_file_metricname_dir, check_file_timestamp)

        failed_check_file = '%s/%s' % (metric_failed_check_dir, check_file_name)
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: failed_check_file - %s' % failed_check_file)

        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: failed to load metric variables from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        # Test metric variables
        # We use a pythonic methodology to test if the variables are defined,
        # this ensures that if any of the variables are not set for some reason
        # we can handle unexpected data or situations gracefully and try and
        # ensure that the process does not hang.
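        # e.g. referencing metric_vars.metric raises AttributeError when
        # the check file did not define it, which each bare except below
        # turns into a fail_check and return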
        try:
            metric_vars.metric
            metric = str(metric_vars.metric)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - metric - %s' % metric)
        except:
            logger.error('error :: failed to read metric variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.value
            value = str(metric_vars.value)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - value - %s' % (value))
        except:
            logger.error('error :: failed to read value variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.from_timestamp
            from_timestamp = str(metric_vars.from_timestamp)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - from_timestamp - %s' % from_timestamp)
        except:
            # @added 20160822 - Bug #1460: panorama check file fails
            # Added exception handling here
            logger.info(traceback.format_exc())
            logger.error('error :: failed to read from_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.metric_timestamp
            metric_timestamp = str(metric_vars.metric_timestamp)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - metric_timestamp - %s' % metric_timestamp)
        except:
            logger.error('error :: failed to read metric_timestamp variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.algorithms
            algorithms = metric_vars.algorithms
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - algorithms - %s' % str(algorithms))
        except:
            logger.error('error :: failed to read algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.triggered_algorithms
            triggered_algorithms = metric_vars.triggered_algorithms
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - triggered_algorithms - %s' % str(triggered_algorithms))
        except:
            logger.error('error :: failed to read triggered_algorithms variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.app
            app = str(metric_vars.app)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - app - %s' % app)
        except:
            logger.error('error :: failed to read app variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.source
            source = str(metric_vars.source)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - source - %s' % source)
        except:
            logger.error('error :: failed to read source variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.added_by
            added_by = str(metric_vars.added_by)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - added_by - %s' % added_by)
        except:
            logger.error('error :: failed to read added_by variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        try:
            metric_vars.added_at
            added_at = str(metric_vars.added_at)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: metric variable - added_at - %s' % added_at)
        except:
            logger.error('error :: failed to read added_at variable from check file - %s' % (metric_check_file))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return

        record_anomaly = True
        cache_key = '%s.last_check.%s.%s' % (skyline_app, app, metric)
        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: cache_key - %s' % cache_key)
        try:
            last_check = self.redis_conn.get(cache_key)
        except Exception as e:
            logger.error(
                'error :: could not query cache_key - %s - %s' % (cache_key, e))
            last_check = None

        if last_check:
            record_anomaly = False
            logger.info('Panorama metric key not expired - %s' % cache_key)

        # @added 20160907 - Handle Panorama stampede on restart after not running #26
        # Allow to expire check if greater than PANORAMA_CHECK_MAX_AGE
        if max_age:
            now = time()
            anomaly_age = int(now) - int(metric_timestamp)
            if anomaly_age > max_age_seconds:
                record_anomaly = False
                logger.info(
                    'Panorama check max age exceeded - %s - %s seconds old, older than %s seconds - discarding' % (
                        metric, str(anomaly_age), str(max_age_seconds)))

        if not record_anomaly:
            logger.info('not recording anomaly for - %s' % (metric))
            if os.path.isfile(str(metric_check_file)):
                try:
                    os.remove(str(metric_check_file))
                    logger.info('metric_check_file removed - %s' % str(metric_check_file))
                except OSError:
                    pass

            return

        # Determine the id of a given value, inserting it if it is unknown
        def determine_id(table, key, value):
            """
            Get the id of something from Redis or the database and insert a new
            record if one does not exist for the value.

            :param table: table name
            :param key: key name
            :param value: the value to look up
            :type table: str
            :type key: str
            :type value: str
            :return: int or boolean

            """

            query_cache_key = '%s.mysql_ids.%s.%s.%s' % (skyline_app, table, key, value)
            determined_id = None
            redis_determined_id = None
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: query_cache_key - %s' % (query_cache_key))

            try:
                redis_known_id = self.redis_conn.get(query_cache_key)
            except:
                redis_known_id = None

            if redis_known_id:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(redis_known_id)
                redis_determined_id = list(unpacker)

            if redis_determined_id:
                determined_id = int(redis_determined_id[0])

            if determined_id:
                if determined_id > 0:
                    return determined_id

            # Query MySQL
            query = 'select id FROM %s WHERE %s=\'%s\'' % (table, key, value)
            results = self.mysql_select(query)

            determined_id = 0
            if results:
                determined_id = int(results[0][0])

            if determined_id > 0:
                # Set the key for a week
                if not redis_determined_id:
                    try:
                        self.redis_conn.setex(query_cache_key, 604800, packb(determined_id))
                        logger.info('set redis query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error('error :: failed to set query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                return int(determined_id)

            # INSERT because no known id
            insert_query = 'insert into %s (%s) VALUES (\'%s\')' % (table, key, value)
            logger.info('inserting %s into %s table' % (value, table))
            try:
                results = self.mysql_insert(insert_query)
            except:
                logger.error(traceback.format_exc())
                logger.error('error :: failed to determine the id of %s from the insert' % (value))
                raise

            determined_id = 0
            if results:
                determined_id = int(results)
            else:
                logger.error('error :: results not set')
                raise ValueError('no id was returned from the insert')

            if determined_id > 0:
                # Set the key for a week
                if not redis_determined_id:
                    try:
                        self.redis_conn.setex(query_cache_key, 604800, packb(determined_id))
                        logger.info('set redis query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                    except Exception as e:
                        logger.error(traceback.format_exc())
                        logger.error('%s' % str(e))
                        logger.error('error :: failed to set query_cache_key - %s - id: %s' % (
                            query_cache_key, str(determined_id)))
                return determined_id

            logger.error('error :: failed to determine the inserted id for %s' % value)
            return False

        try:
            added_by_host_id = determine_id('hosts', 'host', added_by)
        except:
            logger.error('error :: failed to determine id of %s' % (added_by))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            app_id = determine_id('apps', 'app', app)
        except:
            logger.error('error :: failed to determine id of %s' % (app))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            source_id = determine_id('sources', 'source', source)
        except:
            logger.error('error :: failed to determine id of %s' % (source))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            metric_id = determine_id('metrics', 'metric', metric)
        except:
            logger.error('error :: failed to determine id of %s' % (metric))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        algorithms_ids_csv = ''
        for algorithm in algorithms:
            try:
                algorithm_id = determine_id('algorithms', 'algorithm', algorithm)
            except:
                logger.error('error :: failed to determine id of %s' % (algorithm))
                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
                return False
            if algorithms_ids_csv == '':
                algorithms_ids_csv = str(algorithm_id)
            else:
                algorithms_ids_csv = '%s,%s' % (algorithms_ids_csv, str(algorithm_id))

        triggered_algorithms_ids_csv = ''
        for triggered_algorithm in triggered_algorithms:
            try:
                triggered_algorithm_id = determine_id('algorithms', 'algorithm', triggered_algorithm)
            except:
                logger.error('error :: failed to determine id of %s' % (triggered_algorithm))
                fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
                return False
            if triggered_algorithms_ids_csv == '':
                triggered_algorithms_ids_csv = str(triggered_algorithm_id)
            else:
                triggered_algorithms_ids_csv = '%s,%s' % (
                    triggered_algorithms_ids_csv, str(triggered_algorithm_id))

        logger.info('inserting anomaly')
        try:
            full_duration = int(metric_timestamp) - int(from_timestamp)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: full_duration - %s' % str(full_duration))
        except:
            logger.error('error :: failed to determine full_duration')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            anomalous_datapoint = round(float(value), 6)
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: anomalous_datapoint - %s' % str(anomalous_datapoint))
        except:
            logger.error('error :: failed to determine anomalous_datapoint')
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            columns = '%s, %s, %s, %s, %s, %s, %s, %s, %s' % (
                'metric_id', 'host_id', 'app_id', 'source_id',
                'anomaly_timestamp', 'anomalous_datapoint', 'full_duration',
                'algorithms_run', 'triggered_algorithms')
            if settings.ENABLE_PANORAMA_DEBUG:
                logger.info('debug :: columns - %s' % str(columns))
        except:
            logger.error('error :: failed to construct columns string')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        try:
            query = 'insert into anomalies (%s) VALUES (%d, %d, %d, %d, %s, %.6f, %d, \'%s\', \'%s\')' % (
                columns, metric_id, added_by_host_id, app_id, source_id,
                metric_timestamp, anomalous_datapoint, full_duration,
                algorithms_ids_csv, triggered_algorithms_ids_csv)
        except:
            logger.error('error :: failed to construct insert query')
            logger.info(traceback.format_exc())
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        if settings.ENABLE_PANORAMA_DEBUG:
            logger.info('debug :: anomaly insert - %s' % str(query))

        try:
            anomaly_id = self.mysql_insert(query)
            logger.info('anomaly id - %d - created for %s at %s' % (
                anomaly_id, metric, metric_timestamp))
        except:
            logger.info(traceback.format_exc())
            logger.error('error :: failed to insert anomaly for %s at %s' % (
                metric, metric_timestamp))
            fail_check(skyline_app, metric_failed_check_dir, str(metric_check_file))
            return False

        # Set anomaly record cache key
        try:
            self.redis_conn.setex(
                cache_key, settings.PANORAMA_EXPIRY_TIME, packb(value))
            logger.info('set cache_key - %s - expiry %s' % (
                cache_key, str(settings.PANORAMA_EXPIRY_TIME)))
        except Exception as e:
            logger.error(
                'error :: could not set cache_key - %s - %s' % (cache_key, e))

        if os.path.isfile(str(metric_check_file)):
            try:
                os.remove(str(metric_check_file))
                logger.info('metric_check_file removed - %s' % str(metric_check_file))
            except OSError:
                pass

        return anomaly_id
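
The determine_id helper above implements a cache-aside, get-or-insert id lookup: Redis is checked first for a msgpack-packed id, the database is only queried on a cache miss, and an unknown value is inserted and then cached for a week. Below is a minimal standalone sketch of the same pattern, assuming a local Redis instance and using sqlite3 as a stand-in for the mysql_select/mysql_insert helpers; the table and key names are illustrative only. Note that the value is bound as a query parameter rather than interpolated into the SQL string, a safer variant than the string-formatted queries above (table and column names cannot be parameterized, so they are still formatted in).

import sqlite3

import redis
from msgpack import packb, Unpacker


def determine_id_sketch(redis_conn, db_conn, table, key, value):
    """Return the id for value, inserting a new row if none exists."""
    query_cache_key = 'sketch.mysql_ids.%s.%s.%s' % (table, key, value)

    # 1. Try the Redis cache first - ids are stored msgpack packed
    packed_id = redis_conn.get(query_cache_key)
    if packed_id:
        unpacker = Unpacker(use_list=False)
        unpacker.feed(packed_id)
        return int(list(unpacker)[0])

    # 2. Cache miss - query the database, binding the value as a
    #    parameter rather than interpolating it into the query string
    cursor = db_conn.cursor()
    cursor.execute('SELECT id FROM %s WHERE %s = ?' % (table, key), (value,))
    row = cursor.fetchone()
    if row:
        determined_id = int(row[0])
    else:
        # 3. Unknown value - insert it and take the new row id
        cursor.execute('INSERT INTO %s (%s) VALUES (?)' % (table, key), (value,))
        db_conn.commit()
        determined_id = int(cursor.lastrowid)

    # 4. Cache the id for a week so subsequent checks skip the database
    redis_conn.setex(query_cache_key, 604800, packb(determined_id))
    return determined_id


if __name__ == '__main__':
    db = sqlite3.connect(':memory:')
    db.execute('CREATE TABLE apps (id INTEGER PRIMARY KEY, app TEXT)')
    r = redis.StrictRedis()  # assumes Redis on localhost:6379
    print(determine_id_sketch(r, db, 'apps', 'app', 'analyzer'))  # inserts, returns 1
    print(determine_id_sketch(r, db, 'apps', 'app', 'analyzer'))  # served from Redis
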
Example No. 6
0
    def spin_process(self, i, run_timestamp):
        """
        Assign a metric for a process to analyze.
        """

        # Discover metric to analyze
        metric_var_files = [
            f for f in listdir(settings.MIRAGE_CHECK_PATH)
            if isfile(join(settings.MIRAGE_CHECK_PATH, f))
        ]

        # Check if this process is unnecessary
        if len(metric_var_files) == 0:
            return

        metric_var_files_sorted = sorted(metric_var_files)
        metric_check_file = '%s/%s' % (settings.MIRAGE_CHECK_PATH,
                                       str(metric_var_files_sorted[0]))

        # Load metric variables
        # @modified 20160822 - Bug #1460: panorama check file fails
        # Changed to panorama style skyline_functions load_metric_vars
        # self.load_metric_vars(metric_check_file)
        # Load and validate metric variables
        try:
            metric_vars = load_metric_vars(skyline_app, str(metric_check_file))
        except:
            logger.info(traceback.format_exc())
            logger.error(
                'error :: failed to load metric variables from check file - %s'
                % (metric_check_file))
            # Derive the failed check dir for fail_check, as the Panorama
            # example does (this assumes a module level failed_checks_dir,
            # as in the other examples)
            check_file_name = os.path.basename(str(metric_check_file))
            check_file_timestamp = check_file_name.split('.', 1)[0]
            check_file_metricname = check_file_name.split('.', 1)[1].replace('.txt', '')
            metric_failed_check_dir = '%s/%s/%s' % (
                failed_checks_dir, check_file_metricname.replace('.', '/'),
                check_file_timestamp)
            fail_check(skyline_app, metric_failed_check_dir,
                       str(metric_check_file))
            return

        # Test metric variables
        if len(metric_vars.metric) == 0:
            return
        else:
            metric = metric_vars.metric
            metric_name = ['metric_name', metric_vars.metric]
            self.metric_variables.append(metric_name)
        if len(metric_vars.value) == 0:
            return
        else:
            metric_value = ['metric_value', metric_vars.value]
            self.metric_variables.append(metric_value)
        if len(metric_vars.hours_to_resolve) == 0:
            return
        else:
            hours_to_resolve = [
                'hours_to_resolve', metric_vars.hours_to_resolve
            ]
            self.metric_variables.append(hours_to_resolve)
        if len(metric_vars.metric_timestamp) == 0:
            return
        else:
            metric_timestamp = [
                'metric_timestamp', metric_vars.metric_timestamp
            ]
            self.metric_variables.append(metric_timestamp)

        # Discard any metric check that is older than MIRAGE_STALE_SECONDS
        int_metric_timestamp = int(metric_vars.metric_timestamp)
        int_run_timestamp = int(run_timestamp)
        metric_timestamp_age = int_run_timestamp - int_metric_timestamp
        if metric_timestamp_age > settings.MIRAGE_STALE_SECONDS:
            logger.info(
                'stale check :: %s check request is %s seconds old - discarding'
                % (metric_vars.metric, metric_timestamp_age))
            # Remove the metric check file and discard the stale check
            if os.path.exists(metric_check_file):
                os.remove(metric_check_file)
                logger.info('removed %s' % (metric_check_file))
            else:
                logger.info('could not remove %s' % (metric_check_file))
            return

        # Convert the hours of second order resolution to seconds
        second_order_resolution_seconds = int(
            metric_vars.hours_to_resolve) * 3600

        # Calculate graphite from and until parameters from the metric timestamp
        graphite_until = datetime.datetime.fromtimestamp(
            int(metric_vars.metric_timestamp)).strftime('%H:%M_%Y%m%d')
        int_second_order_resolution_seconds = int(
            second_order_resolution_seconds)
        second_resolution_timestamp = int_metric_timestamp - int_second_order_resolution_seconds
        graphite_from = datetime.datetime.fromtimestamp(
            int(second_resolution_timestamp)).strftime('%H:%M_%Y%m%d')

        # Remove any old json file related to the metric
        metric_json_file = '%s/%s/%s.json' % (settings.MIRAGE_DATA_FOLDER,
                                              str(metric_vars.metric),
                                              str(metric_vars.metric))
        try:
            os.remove(metric_json_file)
        except OSError:
            pass

        # Get data from graphite
        logger.info(
            'retrieve data :: surfacing %s timeseries from graphite for %s seconds'
            % (metric_vars.metric, second_order_resolution_seconds))
        self.surface_graphite_metric_data(metric_vars.metric, graphite_from,
                                          graphite_until)

        # Check there is a json timeseries file to test
        if not os.path.isfile(metric_json_file):
            logger.error(
                'error :: retrieve failed - failed to surface %s timeseries from graphite'
                % (metric_vars.metric))
            # Remove metric check file
            try:
                os.remove(metric_check_file)
            except OSError:
                pass
            return
        else:
            logger.info('retrieved data :: for %s at %s seconds' %
                        (metric_vars.metric, second_order_resolution_seconds))

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        self.check_if_parent_is_alive()

        with open(metric_json_file, 'r') as f:
            timeseries = json.loads(f.read())
            logger.info('data points surfaced :: %s' % (len(timeseries)))

        try:
            logger.info('analyzing :: %s at %s seconds' %
                        (metric_vars.metric, second_order_resolution_seconds))
            anomalous, ensemble, datapoint = run_selected_algorithm(
                timeseries, metric_vars.metric,
                second_order_resolution_seconds)

            # If it's anomalous, add it to list
            if anomalous:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                anomalous_metric = [datapoint, base_name]
                self.anomalous_metrics.append(anomalous_metric)
                logger.info('anomaly detected  :: %s with %s' %
                            (metric_vars.metric, metric_vars.value))
                # It runs so fast, this allows us to process 30 anomalies/min
                sleep(2)

                # Get the anomaly breakdown - who returned True?
                triggered_algorithms = []
                for index, value in enumerate(ensemble):
                    if value:
                        algorithm = settings.MIRAGE_ALGORITHMS[index]
                        anomaly_breakdown[algorithm] += 1
                        triggered_algorithms.append(algorithm)

                # If Crucible or Panorama are enabled determine details
                determine_anomaly_details = False
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    determine_anomaly_details = True
                if settings.PANORAMA_ENABLED:
                    determine_anomaly_details = True

                if determine_anomaly_details:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')

                # If Panorama is enabled - create a Panorama check
                if settings.PANORAMA_ENABLED:
                    if not os.path.exists(settings.PANORAMA_CHECK_PATH):
                        if python_version == 2:
                            mode_arg = int('0755', 8)
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(settings.PANORAMA_CHECK_PATH, mode_arg)

                    # Note:
                    # The values are intentionally enclosed in single quotes,
                    # as the imp.load_source used to import them shifts the
                    # decimal position when they are double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0
                    # (a standalone sketch of this check file round trip
                    # follows after this example)
                    added_at = str(int(time()))
                    source = 'graphite'
                    panorama_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'app = \'%s\'\n' \
                                            'source = \'%s\'\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, skyline_app, source,
                           this_host, added_at)

                    # Create an anomaly file with details about the anomaly
                    panorama_anomaly_file = '%s/%s.%s.txt' % (
                        settings.PANORAMA_CHECK_PATH, added_at, base_name)
                    try:
                        write_data_to_file(skyline_app, panorama_anomaly_file,
                                           'w', panorama_anomaly_data)
                        logger.info('added panorama anomaly file :: %s' %
                                    (panorama_anomaly_file))
                    except:
                        logger.error(
                            'error :: failed to add panorama anomaly file :: %s'
                            % (panorama_anomaly_file))
                        logger.info(traceback.format_exc())

                # If crucible is enabled - save timeseries and create a
                # crucible check
                if settings.ENABLE_CRUCIBLE and settings.MIRAGE_CRUCIBLE_ENABLED:
                    metric_timestamp = str(int(timeseries[-1][0]))
                    from_timestamp = str(int(timeseries[1][0]))
                    timeseries_dir = base_name.replace('.', '/')
                    crucible_anomaly_dir = settings.CRUCIBLE_DATA_FOLDER + '/' + timeseries_dir + '/' + metric_timestamp
                    if not os.path.exists(crucible_anomaly_dir):
                        if python_version == 2:
                            mode_arg = int('0755', 8)
                        if python_version == 3:
                            mode_arg = 0o755
                        os.makedirs(crucible_anomaly_dir, mode_arg)

                    # Note:
                    # The value is intentionally enclosed in single quotes,
                    # as the imp.load_source used in crucible shifts the
                    # decimal position when it is double quoted, e.g.
                    # value = "5622.0" gets imported as
                    # 2016-03-02 12:53:26 :: 28569 :: metric variable - value - 562.2
                    # single quoting results in the desired,
                    # 2016-03-02 13:16:17 :: 1515 :: metric variable - value - 5622.0

                    crucible_anomaly_data = 'metric = \'%s\'\n' \
                                            'value = \'%s\'\n' \
                                            'from_timestamp = \'%s\'\n' \
                                            'metric_timestamp = \'%s\'\n' \
                                            'algorithms = %s\n' \
                                            'triggered_algorithms = %s\n' \
                                            'anomaly_dir = \'%s\'\n' \
                                            'graphite_metric = True\n' \
                                            'run_crucible_tests = False\n' \
                                            'added_by = \'%s\'\n' \
                                            'added_at = \'%s\'\n' \
                        % (base_name, str(datapoint), from_timestamp,
                           metric_timestamp, str(settings.MIRAGE_ALGORITHMS),
                           triggered_algorithms, crucible_anomaly_dir,
                           skyline_app, metric_timestamp)

                    # Create an anomaly file with details about the anomaly
                    crucible_anomaly_file = '%s/%s.txt' % (
                        crucible_anomaly_dir, base_name)
                    try:
                        write_data_to_file(skyline_app, crucible_anomaly_file,
                                           'w', crucible_anomaly_data)
                        logger.info('added crucible anomaly file :: %s' %
                                    (crucible_anomaly_file))
                    except:
                        logger.error(
                            'error :: failed to add crucible anomaly file :: %s'
                            % (crucible_anomaly_file))
                        logger.info(traceback.format_exc())

                    # Create timeseries json file with the timeseries
                    json_file = '%s/%s.json' % (crucible_anomaly_dir,
                                                base_name)
                    timeseries_json = str(timeseries).replace(
                        '[', '(').replace(']', ')')
                    try:
                        write_data_to_file(skyline_app, json_file, 'w',
                                           timeseries_json)
                        logger.info('added crucible timeseries file :: %s' %
                                    (json_file))
                    except:
                        logger.error(
                            'error :: failed to add crucible timeseries file :: %s'
                            % (json_file))
                        logger.info(traceback.format_exc())

                    # Create a crucible check file
                    crucible_check_file = '%s/%s.%s.txt' % (
                        settings.CRUCIBLE_CHECK_PATH, metric_timestamp,
                        base_name)
                    try:
                        write_data_to_file(skyline_app, crucible_check_file,
                                           'w', crucible_anomaly_data)
                        logger.info('added crucible check :: %s,%s' %
                                    (base_name, metric_timestamp))
                    except:
                        logger.error(
                            'error :: failed to add crucible check file :: %s'
                            % (crucible_check_file))
                        logger.info(traceback.format_exc())
            else:
                base_name = metric.replace(settings.FULL_NAMESPACE, '', 1)
                not_anomalous_metric = [datapoint, base_name]
                self.not_anomalous_metrics.append(not_anomalous_metric)
                logger.info('not anomalous     :: %s with %s' %
                            (metric_vars.metric, metric_vars.value))

        # It could have been deleted by the Roomba
        except TypeError:
            exceptions['DeletedByRoomba'] += 1
            logger.info('exceptions        :: DeletedByRoomba')
        except TooShort:
            exceptions['TooShort'] += 1
            logger.info('exceptions        :: TooShort')
        except Stale:
            exceptions['Stale'] += 1
            logger.info('exceptions        :: Stale')
        except Boring:
            exceptions['Boring'] += 1
            logger.info('exceptions        :: Boring')
        except:
            exceptions['Other'] += 1
            logger.info('exceptions        :: Other')
            logger.info(traceback.format_exc())

        # Add values to the queue so the parent process can collate
        for key, value in anomaly_breakdown.items():
            self.mirage_anomaly_breakdown_q.put((key, value))

        for key, value in exceptions.items():
            self.mirage_exceptions_q.put((key, value))

        metric_var_files = []
        timeseries = []

        # Remove metric check file
        try:
            os.remove(metric_check_file)
        except OSError:
            pass
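
Both examples above hand anomaly details to other Skyline apps by writing a check file that is itself a small Python module of variable assignments, which load_metric_vars then imports (via imp.load_source, per the notes above). Below is a minimal sketch of that round trip, assuming the single quoting convention described in the comments; the path and values here are illustrative only.

# imp is deprecated in Python 3 (and removed in 3.12); importlib
# machinery would replace it in current code
import imp
from time import time

added_at = str(int(time()))
check_file = '/tmp/%s.example.metric.txt' % added_at

# The values are single quoted, per the note in the examples above
anomaly_data = 'metric = \'example.metric\'\n' \
               'value = \'5622.0\'\n' \
               'from_timestamp = \'1474659600\'\n' \
               'metric_timestamp = \'1474746000\'\n'

with open(check_file, 'w') as f:
    f.write(anomaly_data)

# load_metric_vars style import of the check file as a module
metric_vars = imp.load_source('metric_vars', check_file)
print(metric_vars.metric, metric_vars.value)  # example.metric 5622.0
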