Code example #1
File: analyzer.py  Project: scalextremeinc/skyline
import logging
import operator
import traceback
from collections import defaultdict
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from math import ceil
from multiprocessing import Lock, Manager, Process
from os import getpid, kill, path, system
from smtplib import SMTP
from sys import exit
from threading import Thread
from time import sleep, time

from msgpack import Unpacker, packb
from redis import StrictRedis

# Project-local modules; the module paths and logger name are assumed from
# the upstream skyline layout
import settings
from alerter import Alerter
from algorithms import run_selected_algorithm
from algorithm_exceptions import Boring, Incomplete, Stale, TooShort

logger = logging.getLogger("AnalyzerLog")


class Analyzer(Thread):
    def __init__(self, parent_pid, storage):
        """
        Initialize the Analyzer
        """
        super(Analyzer, self).__init__()
        self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
        self.daemon = True
        self.parent_pid = parent_pid
        self.current_pid = getpid()
        self.lock = Lock()
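        # Manager-backed proxies so these counters are shared with the
        # worker processes spawned in run()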
        self.exceptions = Manager().dict()
        self.anomaly_breakdown = Manager().dict()
        self.anomalous_metrics = Manager().list()
        self.storage = storage
        self.alerter = Alerter(storage)

    def check_if_parent_is_alive(self):
        """
        Exit if either this process or its parent has died.
        """
        try:
            # Signal 0 checks for process existence without sending a signal
            kill(self.current_pid, 0)
            kill(self.parent_pid, 0)
        except OSError:
            exit(0)

    def spin_process(self, i, unique_metrics):
        """
        Assign a bunch of metrics for a process to analyze.
        """
        # Discover assigned metrics
        keys_per_processor = int(ceil(float(len(unique_metrics)) / float(settings.ANALYZER_PROCESSES)))
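        # Workers are numbered 1..ANALYZER_PROCESSES; the last worker's slice
        # is clamped to the end of the list so it never indexes past it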
        if i == settings.ANALYZER_PROCESSES:
            assigned_max = len(unique_metrics)
        else:
            assigned_max = i * keys_per_processor
        assigned_min = assigned_max - keys_per_processor
        assigned_keys = range(assigned_min, assigned_max)

        # Compile assigned metrics
        assigned_metrics = [unique_metrics[index] for index in assigned_keys]

        # Check if this process is unnecessary
        if len(assigned_metrics) == 0:
            return

        # Multi get series
        raw_assigned = self.redis_conn.mget(assigned_metrics)

        # Make process-specific dicts
        exceptions = defaultdict(int)
        anomaly_breakdown = defaultdict(int)

        # Distill timeseries strings into lists
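        # Each Redis value is a stream of msgpack-encoded (timestamp, value)
        # tuples; feeding it to an Unpacker yields the full timeseries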
        for j, metric_name in enumerate(assigned_metrics):
            self.check_if_parent_is_alive()

            try:
                raw_series = raw_assigned[j]
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)

                anomalous, ensemble, datapoint, ts = run_selected_algorithm(timeseries)

                # If it's anomalous, add it to list
                if anomalous:
                    base_name = metric_name.replace(settings.FULL_NAMESPACE, '', 1)
                    metric = [datapoint, base_name, ts]
                    self.anomalous_metrics.append(metric)

                    # Get the anomaly breakdown - who returned True?
                    for index, value in enumerate(ensemble):
                        if value:
                            algorithm = settings.ALGORITHMS[index]
                            anomaly_breakdown[algorithm] += 1

            # It could have been deleted by the Roomba
            except AttributeError:
                exceptions['DeletedByRoomba'] += 1
            except TooShort:
                exceptions['TooShort'] += 1
            except Stale:
                exceptions['Stale'] += 1
            except Incomplete:
                exceptions['Incomplete'] += 1
            except Boring:
                exceptions['Boring'] += 1
            except:
                exceptions['Other'] += 1
                logger.info(traceback.format_exc())

        # Collate process-specific dicts to main dicts
        with self.lock:
            for key, value in anomaly_breakdown.items():
                if key not in self.anomaly_breakdown:
                    self.anomaly_breakdown[key] = value
                else:
                    self.anomaly_breakdown[key] += value

            for key, value in exceptions.items():
                if key not in self.exceptions:
                    self.exceptions[key] = value
                else:
                    self.exceptions[key] += value

    def send_mail(self, alert, metric):
        """
        Send an alert email to the appropriate recipient
        """
        msg = MIMEMultipart('alternative')
        msg['Subject'] = '[skyline alert] ' + metric[1]
        msg['From'] = settings.ALERT_SENDER
        msg['To'] = alert[1]
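        # Inline a Graphite render graph of the metric (the metric name is
        # assumed to be URL-safe here)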
        link = '%s/render/?width=588&height=308&target=%s' % (settings.GRAPHITE_HOST, metric[1])
        body = 'Anomalous value: %s <br> Next alert in: %s seconds <a href="%s"><img src="%s"/></a>' % (metric[0], alert[2], link, link)
        msg.attach(MIMEText(body, 'html'))
        s = SMTP('127.0.0.1')
        s.sendmail(settings.ALERT_SENDER, alert[1], msg.as_string())
        s.quit()

    def run(self):
        """
        Called when the process initializes.
        """
        while True:
            now = time()

            # Make sure Redis is up
            try:
                self.redis_conn.ping()
            except Exception:
                logger.error("skyline can't connect to redis at socket path %s" % settings.REDIS_SOCKET_PATH)
                sleep(10)
                self.redis_conn = StrictRedis(unix_socket_path=settings.REDIS_SOCKET_PATH)
                continue

            # Discover unique metrics
            unique_metrics = list(self.redis_conn.smembers(settings.FULL_NAMESPACE + 'unique_metrics'))

            if len(unique_metrics) == 0:
                logger.info('no metrics in redis. try adding some - see README')
                sleep(10)
                continue

            # Spawn processes
            pids = []
            for i in range(1, settings.ANALYZER_PROCESSES + 1):
                if i > len(unique_metrics):
                    logger.info('WARNING: skyline is set for more cores than needed.')
                    break

                p = Process(target=self.spin_process, args=(i, unique_metrics))
                pids.append(p)
                p.start()

            # Wait for all workers to finish (join also reaps them)
            for p in pids:
                p.join()

            # Old inline alerting path, kept commented out; superseded by the
            # Alerter flow below
            #if settings.ENABLE_ALERTS:
            #    for alert in settings.ALERTS:
            #        for metric in self.anomalous_metrics:
            #            if alert[0] in metric[1]:
            #                try:
            #                    last_alert = self.redis_conn.get('last_alert.' + metric[1])
            #                    if not last_alert:
            #                        self.redis_conn.setex('last_alert.' + metric[1], alert[2], packb(metric[0]))
            #                        self.send_mail(alert, metric)
            #                except Exception as e:
            #                    logger.error("couldn't send alert: %s" % e)

            # Write anomalous_metrics to static webapp directory
            filename = path.abspath(path.join(path.dirname(__file__), '..', settings.ANOMALY_DUMP))
            with open(filename, 'w') as fh:
                # Make it JSONP with a handle_data() function
                anomalous_metrics = list(self.anomalous_metrics)
                anomalous_metrics.sort(key=operator.itemgetter(1))
                fh.write('handle_data(%s)' % anomalous_metrics)
            
            # Process anomalous metrics
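            # A short-lived Redis key (setex with SKIP_FREQUENCY as the TTL)
            # de-dupes, so each anomaly is saved and alerted at most once per
            # window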
            for metric in self.anomalous_metrics:
                try:
                    last_save_key = 'last_save.%s.%s' % (metric[1], metric[2])
                    last_save = self.redis_conn.get(last_save_key)
                    if not last_save:
                        self.redis_conn.setex(last_save_key,
                            settings.SKIP_FREQUENCY, packb(metric[0]))
                        self.storage.save(metric)
                    if settings.ENABLE_ALERTS:
                        last_alert_key = 'last_alert.' + metric[1]
                        last_alert = self.redis_conn.get(last_alert_key)
                        if not last_alert:
                            self.redis_conn.setex(last_alert_key,
                                settings.SKIP_FREQUENCY, packb(metric[0]))
                            self.alerter.add(metric)
                except Exception as e:
                    logger.error("Failed processing anomaly, pid: %s, metric: %s, error: %s",
                        getpid(), metric[1], e)
            
            # Send ready alerts
            if settings.ENABLE_ALERTS:
                try:
                    self.alerter.send_alerts()
                except Exception as e:
                    logger.error("Failed sending alerts, error: %s", e)

            # Log progress
            logger.info('seconds to run    :: %.2f' % (time() - now))
            logger.info('total metrics     :: %d' % len(unique_metrics))
            logger.info('total analyzed    :: %d' % (len(unique_metrics) - sum(self.exceptions.values())))
            logger.info('total anomalies   :: %d' % len(self.anomalous_metrics))
            logger.info('exception stats   :: %s' % self.exceptions)
            logger.info('anomaly breakdown :: %s' % self.anomaly_breakdown)

            # Log to Graphite
            if settings.GRAPHITE_HOST != '':
                host = settings.GRAPHITE_HOST.replace('http://', '')
                system('echo skyline.analyzer.run_time %.2f %s | nc -w 3 %s 2003' % ((time() - now), now, host))
                system('echo skyline.analyzer.total_analyzed %d %s | nc -w 3 %s 2003' % ((len(unique_metrics) - sum(self.exceptions.values())), now, host))

            # Check canary metric
            raw_series = self.redis_conn.get(settings.FULL_NAMESPACE + settings.CANARY_METRIC)
            if raw_series is not None:
                unpacker = Unpacker(use_list=False)
                unpacker.feed(raw_series)
                timeseries = list(unpacker)
                time_human = (timeseries[-1][0] - timeseries[0][0]) / 3600
                projected = 24 * (time() - now) / time_human
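                # time_human: hours spanned by the canary series; projected:
                # estimated seconds a full 24h window would take at this rate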

                logger.info('canary duration   :: %.2f' % time_human)
                if settings.GRAPHITE_HOST != '':
                    host = settings.GRAPHITE_HOST.replace('http://', '')
                    system('echo skyline.analyzer.duration %.2f %s | nc -w 3 %s 2003' % (time_human, now, host))
                    system('echo skyline.analyzer.projected %.2f %s | nc -w 3 %s 2003' % (projected, now, host))


            # Reset counters
            self.anomalous_metrics[:] = []
            self.exceptions = Manager().dict()
            self.anomaly_breakdown = Manager().dict()

            # Sleep if it went too fast
            if time() - now < 5:
                logger.info('sleeping due to low run time...')
                sleep(10)
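
For reference, here is a minimal sketch of how this Analyzer might be wired up. The PrintStorage stub below is hypothetical (the project supplies its own storage object and settings module, and the Alerter built in __init__ also receives it), so treat this as an illustration rather than the project's real entry point.

# Hypothetical usage sketch; PrintStorage is a stand-in, not the project's
# real storage class
from os import getpid
from time import sleep


class PrintStorage(object):
    """Stand-in storage: just logs saved anomalies."""
    def save(self, metric):
        datapoint, base_name, ts = metric
        print('anomaly %s=%s at %s' % (base_name, datapoint, ts))


if __name__ == '__main__':
    analyzer = Analyzer(getpid(), PrintStorage())
    analyzer.start()   # run() loops forever; the thread is a daemon
    while True:
        sleep(1)       # keep the parent alive so check_if_parent_is_alive passes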