def test_infinite_cleanup_loop(self, mocked_apply_async):
    """
    There is a potential for the cleanup task to constantly call itself if every
    time it re-runs there is at least 1 new object to clean up (i.e. every
    3 seconds for 60 days a new result is recorded). Make sure it only re-calls
    itself if the whole batch is used.
    """
    def save_expired_result():
        # 61 days old, same age the other cleanup tests in this file use for stale rows.
        StatusCheckResult(
            status_check=self.graphite_check,
            time=timezone.now() - timedelta(days=61),
            time_complete=timezone.now() - timedelta(days=61),
            succeeded=False,
        ).save()

    with self.settings(CELERY_ALWAYS_EAGER=False):
        for _ in range(2):
            save_expired_result()

        tasks.clean_db(batch_size=2)
        # If full batch is cleaned it should queue itself again
        self.assertTrue(mocked_apply_async.called)

        save_expired_result()
        mocked_apply_async.reset_mock()

        tasks.clean_db(batch_size=2)
        # This time full batch isn't cleaned (only 1 out of 2) - don't call again
        self.assertFalse(mocked_apply_async.called)
def create_dummy_data(self):
    """
    Create the shared fixtures: a user holding the "add" permissions, a Graphite,
    a Jenkins and an HTTP check attached to one service, and two results for the
    Graphite check (an older failure followed by a newer pass).
    """
    self.username = '******'
    self.password = '******'

    self.user = User.objects.create(username=self.username)
    self.user.set_password(self.password)
    granted = [
        Permission.objects.get(codename=codename)
        for codename in (
            'add_instance',
            'add_service',
            'add_httpstatuscheck',
            'add_graphitestatuscheck',
            'add_jenkinsstatuscheck',
            'add_icmpstatuscheck',
        )
    ]
    self.user.user_permissions.add(*granted)
    self.user.save()

    self.graphite_check = GraphiteStatusCheck.objects.create(
        name='Graphite Check',
        metric='stats.fake.value',
        check_type='>',
        value='9.0',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
    )
    self.jenkins_check = JenkinsStatusCheck.objects.create(
        name='Jenkins Check',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        max_queued_build_time=10,
    )
    self.http_check = HttpStatusCheck.objects.create(
        name='Http Check',
        created_by=self.user,
        importance=Service.CRITICAL_STATUS,
        endpoint='http://arachnys.com',
        timeout=10,
        status_code='200',
        text_match=None,
    )

    self.service = Service.objects.create(name='Service')
    self.service.status_checks.add(
        self.graphite_check, self.jenkins_check, self.http_check)

    def record_result(started_ago, completed_ago, succeeded):
        # Offsets are in seconds before "now".
        recorded = StatusCheckResult(
            status_check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=started_ago),
            time_complete=timezone.now() - timedelta(seconds=completed_ago),
            succeeded=succeeded,
        )
        recorded.save()
        return recorded

    # failing is second most recent
    self.older_result = record_result(60, 59, False)
    # Passing is most recent
    self.most_recent_result = record_result(1, 0, True)

    self.graphite_check.save()  # Will recalculate status
def test_match_all_in(self):
    """
    Exercise Acknowledgement.MATCH_ALL_IN against a result carrying no tags,
    one of the ack's tags, and then an additional tag the ack does not have.
    """
    acked_a, acked_b, unacked = [
        StatusCheckResultTag.objects.get_or_create(value='tag' + str(n))[0]
        for n in range(3)
    ]

    ack = Acknowledgement(status_check=self.http_check,
                          match_if=Acknowledgement.MATCH_ALL_IN)
    ack.save()
    ack.tags.add(acked_a, acked_b)

    now = timezone.now()
    result = StatusCheckResult(status_check=self.http_check, succeeded=False,
                               time=now, time_complete=now)
    result.save()

    # no tags matches
    self.assertTrue(ack.matches_result(result))

    # 1 matching tag matches
    result.tags.add(acked_a)
    self.assertTrue(ack.matches_result(result))

    # 1 matching, 1 not should NOT match
    result.tags.add(unacked)
    self.assertFalse(ack.matches_result(result))
def run_checks(self, checks, from_service_status=None):
    # type: (List[Tuple[StatusCheck, bool, bool]], Union[None, str]) -> None
    """
    Simulate running the given checks with the given outcomes, then update the
    service (triggering alerts).

    Every existing StatusCheckResult is deleted first. The same check may appear
    more than once. Set up self.service.alerts before calling this.

    :param checks: list of (check, succeeded, acked) tuples
    :param from_service_status: service status to transition from; when given it
                                is written to service.overall_status before the
                                update (optional)
    """
    # clear any previous results
    StatusCheckResult.objects.all().delete()

    # forwards-compatible with acks: only set the flag when the model has it
    supports_acks = hasattr(StatusCheckResult, 'acked')

    for check, succeeded, acked in checks:
        run_at = timezone.now()
        outcome = StatusCheckResult(status_check=check, time=run_at,
                                    time_complete=run_at, succeeded=succeeded)
        if supports_acks:
            outcome.acked = acked
        outcome.save()

        check.last_run = run_at
        check.save()

    if from_service_status:
        self.service.overall_status = from_service_status
    self.service.update_status()
def create_dummy_data(self):
    """
    Create the shared fixtures: a user holding the "add" permissions, a Graphite,
    a Jenkins and an HTTP check attached to one service, and two results for the
    Graphite check (an older failure followed by a newer pass).
    """
    self.username = '******'
    self.password = '******'

    self.user = User.objects.create(username=self.username)
    self.user.set_password(self.password)
    granted = [
        Permission.objects.get(codename=codename)
        for codename in (
            'add_instance',
            'add_service',
            'add_httpstatuscheck',
            'add_graphitestatuscheck',
            'add_jenkinsstatuscheck',
            'add_icmpstatuscheck',
        )
    ]
    self.user.user_permissions.add(*granted)
    self.user.save()

    self.graphite_check = GraphiteStatusCheck.objects.create(
        name='Graphite Check',
        metric='stats.fake.value',
        check_type='>',
        value='9.0',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
    )
    self.jenkins_check = JenkinsStatusCheck.objects.create(
        name='Jenkins Check',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        max_queued_build_time=10,
    )
    self.http_check = HttpStatusCheck.objects.create(
        name='Http Check',
        created_by=self.user,
        importance=Service.CRITICAL_STATUS,
        endpoint='http://arachnys.com',
        timeout=10,
        status_code='200',
        text_match=None,
    )

    self.service = Service.objects.create(name='Service')
    self.service.status_checks.add(
        self.graphite_check, self.jenkins_check, self.http_check)

    def record_result(started_ago, completed_ago, succeeded):
        # Offsets are in seconds before "now". The result is linked through the
        # `check` keyword, exactly as this fixture has always done.
        recorded = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=started_ago),
            time_complete=timezone.now() - timedelta(seconds=completed_ago),
            succeeded=succeeded,
        )
        recorded.save()
        return recorded

    # failing is second most recent
    self.older_result = record_result(60, 59, False)
    # Passing is most recent
    self.most_recent_result = record_result(1, 0, True)

    self.graphite_check.save()  # Will recalculate status
def create_dummy_data(self):
    """
    Create the plugin-based fixtures: a user, a service, the port-open check and
    chat-messenger alert plugin models, two port-open checks attached to the
    service, and two results for the first check (older failure, newer pass).
    """
    self.username = '******'
    self.password = '******'

    self.user = User.objects.create(username=self.username)
    self.user.set_password(self.password)
    self.user.user_permissions.add(
        Permission.objects.get(codename='add_instance'),
        Permission.objects.get(codename='add_service'),
    )
    self.user.save()

    self.service = Service.objects.create(name='Service')

    self.port_open_check_model = StatusCheckPluginModel.objects.create(
        slug='port_open_check')
    self.chat_messenger_alert_model = AlertPluginModel.objects.create(
        slug='chat_messenger_alert')

    # Refetch User model with new chat messenger settings.
    self.user = User.objects.get(username='******')
    # NOTE(review): the nickname is only assigned in memory; nothing in this
    # method saves the settings object afterwards - confirm that is intended.
    self.user.chat_messenger_alert_settings.nickname = "Xx__CabotMaster420__xX"

    self.port_open_check = StatusCheck.objects.create(
        name='Port Open Check for Service',
        check_plugin=StatusCheckPluginModel.objects.get(slug='port_open_check'),
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        port=123,
        address='ports.arachnys.com',
    )

    # Sanity-check that the plugin variables round-trip through the model.
    self.assertEqual(self.port_open_check.get_variable('port'), 123)
    self.assertEqual(StatusCheck.objects.get().port, 123)
    self.assertEqual(StatusCheck.objects.get().address, 'ports.arachnys.com')

    self.port_open_check_2 = StatusCheck.objects.create(
        name='Port Open Check for Service 2',
        check_plugin=StatusCheckPluginModel.objects.get(slug='port_open_check'),
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        port=456,
        address='ports.arachnys.com',
    )

    self.service.status_checks.add(self.port_open_check, self.port_open_check_2)

    def record_result(started_ago, completed_ago, succeeded):
        # Offsets are in seconds before "now".
        recorded = StatusCheckResult(
            status_check=self.port_open_check,
            time=timezone.now() - timedelta(seconds=started_ago),
            time_complete=timezone.now() - timedelta(seconds=completed_ago),
            succeeded=succeeded,
        )
        recorded.save()
        return recorded

    # failing is second most recent
    self.older_result = record_result(60, 59, False)
    # Passing is most recent
    self.most_recent_result = record_result(1, 0, True)

    self.port_open_check.save()  # Will recalculate status
class LocalTestCase(TestCase):
    """
    Base test case that replaces the outbound HTTP / Twilio / mail entry points
    with Mocks and builds a small set of checks, a service and check results.
    """

    def setUp(self):
        """Stub the outbound integrations, then create the fixtures."""
        for owner, attr in (
            (requests, 'get'),
            (requests, 'post'),
            (rest, 'TwilioRestClient'),
            (mail, 'send_mail'),
        ):
            self._stub_attribute(owner, attr)
        self.create_dummy_data()
        super(LocalTestCase, self).setUp()

    def _stub_attribute(self, owner, attr):
        """
        Replace ``owner.attr`` with a fresh Mock for the duration of one test.

        The previous value is restored through addCleanup so the patch does not
        leak into tests that run later in the same process.
        """
        try:
            original = getattr(owner, attr)
        except AttributeError:
            # Nothing to restore: just remove the stub again afterwards.
            self.addCleanup(delattr, owner, attr)
        else:
            self.addCleanup(setattr, owner, attr, original)
        setattr(owner, attr, Mock())

    def create_dummy_data(self):
        """
        Create a user, one Graphite, one Jenkins and one HTTP check attached to a
        single service, and two results for the Graphite check.
        """
        self.username = '******'
        self.password = '******'
        self.user = User.objects.create(username=self.username)
        self.user.set_password(self.password)
        self.user.save()

        self.graphite_check = GraphiteStatusCheck.objects.create(
            name='Graphite Check',
            metric='stats.fake.value',
            check_type='>',
            value='9.0',
            created_by=self.user,
            importance=Service.ERROR_STATUS,
        )
        self.jenkins_check = JenkinsStatusCheck.objects.create(
            name='Jenkins Check',
            created_by=self.user,
            importance=Service.ERROR_STATUS,
            max_queued_build_time=10,
        )
        self.http_check = HttpStatusCheck.objects.create(
            name='Http Check',
            created_by=self.user,
            importance=Service.CRITICAL_STATUS,
            endpoint='http://arachnys.com',
            timeout=10,
            status_code='200',
            text_match=None,
        )
        self.service = Service.objects.create(name='Service', )
        self.service.status_checks.add(self.graphite_check,
                                       self.jenkins_check, self.http_check)

        # Passing is most recent
        self.most_recent_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=1),
            time_complete=timezone.now(),
            succeeded=True)
        self.most_recent_result.save()

        # failing is second most recent
        self.older_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=60),
            time_complete=timezone.now() - timedelta(seconds=59),
            succeeded=False)
        self.older_result.save()

        self.graphite_check.save()  # Will recalculate status
def _run(self):
    """
    Check that a TCP connection to (self.host, self.port) can be opened within
    self.timeout and shut down cleanly.

    :return: a StatusCheckResult with ``succeeded`` set, and ``error`` describing
             the exception when the attempt failed
    """
    result = StatusCheckResult(status_check=self)
    try:
        s = socket.create_connection((self.host, self.port), self.timeout)
        try:
            s.shutdown(socket.SHUT_RDWR)
        finally:
            # Always release the descriptor, even if shutdown() raised.
            s.close()
    except Exception as e:
        # Format the exception itself: `e.message` is empty for two-argument
        # socket errors on Python 2 and does not exist on Python 3.
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """
    Ping self.host and fail on any packet loss or, when self.max_rtt is set, on
    an average round-trip time above it.

    The raw ``ping`` output is stored in ``raw_data``. A non-zero exit status
    stores ping's own output as the error; any other failure stores a message
    followed by the host.

    :return: a StatusCheckResult describing the outcome
    """
    result = StatusCheckResult(status_check=self)
    # Argument list + shell=False: the host is never interpreted by a shell.
    args = [
        'ping',
        '-s', str(self.packet_size),
        '-c', str(self.count),
        '-W', str(self.timeout),
        self.host,
    ]
    try:
        result.raw_data = subprocess.check_output(
            args, stderr=subprocess.STDOUT, shell=False)
        r = self._parse_output(result.raw_data)
        if r['packet_loss'] > 0.0:
            raise Exception("%0.1f%% packet loss" % r['packet_loss'])
        elif self.max_rtt and r['rtt']['avg'] > self.max_rtt:
            raise Exception(
                "Maximum average RTT reached: %s" % r['rtt']['avg'])
    except subprocess.CalledProcessError as e:
        result.succeeded = False
        result.error = e.output
    except Exception as e:
        # Format the exception itself: `e.message` is empty for OS errors on
        # Python 2 and does not exist on Python 3.
        result.error = u"{}, Host: {}".format(e, self.host)
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def trigger_failing_check(self, check):
    """
    Record a failed result for ``check`` (started 60s ago, completed 59s ago),
    then stamp the check's last_run with the current time and save it.
    """
    failed = StatusCheckResult(
        status_check=check,
        time=timezone.now() - timedelta(seconds=60),
        time_complete=timezone.now() - timedelta(seconds=59),
        succeeded=False,
    )
    failed.save()

    check.last_run = timezone.now()
    check.save()
def test_cleanup_simple(self):
    """clean_db removes a 61-day-old result and a 61-day-old service snapshot."""
    results_before = StatusCheckResult.objects.all().count()
    snapshots_before = ServiceStatusSnapshot.objects.all().count()

    stale_snapshot = ServiceStatusSnapshot(
        service=self.service,
        num_checks_active=1,
        num_checks_passing=1,
        num_checks_failing=1,
        overall_status=self.service.overall_status,
        time=timezone.now() - timedelta(days=61),
    )
    stale_snapshot.save()

    stale_result = StatusCheckResult(
        status_check=self.graphite_check,
        time=timezone.now() - timedelta(days=61),
        time_complete=timezone.now() - timedelta(days=61),
        succeeded=False,
    )
    stale_result.save()

    self.assertEqual(StatusCheckResult.objects.all().count(), results_before + 1)

    tasks.clean_db()

    self.assertEqual(StatusCheckResult.objects.all().count(), results_before)
    self.assertEqual(ServiceStatusSnapshot.objects.all().count(), snapshots_before)
def _run(self):
    """
    Fail when the certificate has already expired or expires in fewer than
    self.days days.

    The remaining lifetime is ``self.ssl_expiry_datetime()`` minus the current
    UTC time. On failure the error holds the reason followed by the host and port.

    :return: a StatusCheckResult describing the outcome
    """
    result = StatusCheckResult(status_check=self)
    try:
        remaining = self.ssl_expiry_datetime() - datetime.datetime.utcnow()
        if remaining < datetime.timedelta(days=0):
            # `remaining` is negative here; negate it so the message reads
            # "expired 3 days ago" rather than "expired -3 days ago".
            raise Exception(
                "Certificate expired %s days ago" % (-remaining).days)
        elif remaining < datetime.timedelta(days=self.days):
            raise Exception(
                "Certificate expires in %s days" % remaining.days)
    except Exception as e:
        # Format the exception itself: `e.message` is unreliable on Python 2
        # and does not exist on Python 3.
        result.error = u"{} {} {}".format(e, self.host, self.port)
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """
    Check that a PostgreSQL connection can be opened with the configured
    database name, user, password, host and port.

    :return: a StatusCheckResult with ``succeeded`` set, and ``error`` describing
             the exception when connecting (or closing) failed
    """
    result = StatusCheckResult(status_check=self)
    try:
        conn = psycopg2.connect(
            dbname=self.dbname,
            user=self.dbuser,
            password=self.dbpassword,
            host=self.host,
            port=self.port,
        )
        conn.close()
    except Exception as e:
        # Format the exception itself (`e.message` does not exist on Python 3)
        # and pass a real 1-tuple so the value is never unpacked by `%`.
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """
    Run self._check() and wrap its outcome in a StatusCheckResult.

    On success the value returned by _check() becomes ``raw_data``. A
    StatusGoException contributes its own ``raw_data`` and ``message``; any
    other exception is formatted directly into the error text.
    """
    outcome = StatusCheckResult(status_check=self)
    try:
        payload = self._check()
    except StatusGoException as exc:
        outcome.raw_data = exc.raw_data
        outcome.error = u'Error occurred: {}'.format(exc.message)
        outcome.succeeded = False
        return outcome
    except Exception as exc:
        outcome.error = u'Error occurred: {}'.format(exc)
        outcome.succeeded = False
        return outcome

    outcome.raw_data = payload
    outcome.succeeded = True
    return outcome
def _run(self):
    """
    Drive a short SMTP conversation (connect, EHLO, optional MAIL FROM and
    RCPT TO) and succeed when the last response code equals self.expected_code.

    An exception during the conversation is recorded as the error, but the last
    response code received so far is still compared, so a check that expects an
    error code can pass. The session transcript is stored in ``raw_data``.

    :return: a StatusCheckResult describing the outcome
    """
    result = StatusCheckResult(status_check=self)
    sess = SmtpSession()

    def last_code_matches():
        # False when the server never answered at all.
        codes = sess.response_codes
        return len(codes) > 0 and self.expected_code == codes[-1]

    try:
        sess.connect(self.host, self.port)
        sess.ehlo(self.helo_address)
        if self.sender:
            sess.call('MAIL FROM:', self.sender)
        if self.sender and self.recipient:
            sess.call('RCPT TO:', self.recipient)
    except Exception as e:
        # Format the exception itself: `e.message` is unreliable on Python 2
        # and does not exist on Python 3.
        result.error = u'Error occurred %s: %s' % (e.__class__.__name__, e, )
        result.succeeded = last_code_matches()
    except:  # noqa: E722 - kept: the original also reports non-Exception raises
        result.error = u'Error occurred: %s' % (sys.exc_info()[0], )
        result.succeeded = False
    else:
        result.succeeded = last_code_matches()
    finally:
        sess.quit()
        result.raw_data = "\n".join(sess.conversation)
    return result
def test_cleanup_batch(self):
    """With batch_size=1, clean_db still ends up removing both 61-day-old results."""
    count_before = StatusCheckResult.objects.all().count()

    for _ in range(2):
        stale = StatusCheckResult(
            status_check=self.graphite_check,
            time=timezone.now() - timedelta(days=61),
            time_complete=timezone.now() - timedelta(days=61),
            succeeded=False,
        )
        stale.save()

    self.assertEqual(StatusCheckResult.objects.all().count(), count_before + 2)

    tasks.clean_db(batch_size=1)

    self.assertEqual(StatusCheckResult.objects.all().count(), count_before)
def _run(self):
    """
    Open a TCP connection to (self.host, self.port) and optionally verify a
    request/response exchange.

    * If self.message_to_send is set it is sent after connecting (base64-decoded
      first when self.message_to_send_b64 is true).
    * If self.expected_reply is set (base64-decoded when self.expected_reply_b64
      is true), only as many bytes as the expected reply is long are read and
      compared with it. A server's reply can be long, and often only the
      beginning matters, e.g. ``HTTP/1.1 200 OK``.

    The configured fields on ``self`` are never modified. Connection, send and
    receive errors are reported as a failed result.

    :return: a StatusCheckResult describing the outcome
    """
    import base64

    def as_bytes(value, is_b64):
        # Work on a local copy so the stored configuration is left untouched.
        if is_b64:
            return base64.b64decode(value)
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    result = StatusCheckResult(status_check=self)
    try:
        s = socket.create_connection((self.host, self.port), self.timeout)
    except Exception as e:
        # No socket exists on this path, so there is nothing to close.
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
        return result

    try:
        if self.message_to_send:
            # sendall() keeps writing until the whole payload is out.
            s.sendall(as_bytes(self.message_to_send, self.message_to_send_b64))

        if self.expected_reply:
            expected = as_bytes(self.expected_reply, self.expected_reply_b64)
            # recv() may return fewer bytes than requested; keep reading until
            # we have enough to compare or the peer closes the connection.
            received_response = b''
            while len(received_response) < len(expected):
                chunk = s.recv(len(expected) - len(received_response))
                if not chunk:
                    break
                received_response += chunk
            if received_response != expected:
                result.error = u'Got unexpected response %r' % (
                    received_response, )
                result.succeeded = False
                return result

        result.succeeded = True
    except Exception as e:
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    finally:
        try:
            s.shutdown(socket.SHUT_RDWR)
        except socket.error:
            # The peer may already have closed its end; closing is still safe.
            pass
        s.close()
    return result
def test_print_tags(self):
    """print_tags() lists the result's tags one per line, tag000 through tag009."""
    StatusCheckResult.objects.all().delete()
    StatusCheckResultTag.objects.all().delete()

    now = timezone.now()
    result = StatusCheckResult(status_check=self.http_check, time=now,
                               time_complete=now, succeeded=False)
    result.save()

    StatusCheckResultTag.objects.bulk_create(
        [StatusCheckResultTag(value='tag{:03}'.format(i)) for i in range(10)])

    # Re-read the tags from the database before attaching them, and attach them
    # all in one call instead of indexing the queryset once per tag.
    result.tags.add(*StatusCheckResultTag.objects.all())

    self.assertEqual(
        result.print_tags(),
        'tag000\ntag001\ntag002\ntag003\ntag004\ntag005\ntag006\ntag007\ntag008'
        '\ntag009')
def test_clean_orphaned_tags(self):
    """
    clean_orphaned_tags() deletes tags that are attached to no result, and
    deletes every tag once all results are gone.
    """
    StatusCheckResult.objects.all().delete()
    StatusCheckResultTag.objects.all().delete()

    now = timezone.now()
    StatusCheckResult.objects.bulk_create([
        StatusCheckResult(status_check=self.http_check, time=now,
                          time_complete=now, succeeded=False)
        for _ in range(100)
    ])
    StatusCheckResultTag.objects.bulk_create([
        StatusCheckResultTag(value='tag{:03}'.format(i)) for i in range(100)
    ])

    results = StatusCheckResult.objects.filter(status_check=self.http_check)
    # Order explicitly: slicing an unordered queryset does not guarantee that
    # the first 50 rows are tag000..tag049, which the assertion below requires.
    tags = StatusCheckResultTag.objects.order_by('value')

    # add tags 0-49 to first 50 results
    for result, tag in zip(results[:50], tags[:50]):
        result.tags.add(tag)

    # tags 50-99 should get cleaned up here
    tasks.clean_orphaned_tags()

    tags = StatusCheckResultTag.objects.order_by('value')
    self.assertEqual(len(tags), 50)  # 50 left
    self.assertEqual(list(tags.values_list('value', flat=True)),
                     [u'tag{:03}'.format(i) for i in range(50)])

    # now if we delete the status check results, all tags should all get cleaned up
    StatusCheckResult.objects.all().delete()
    tasks.clean_orphaned_tags()

    tags = StatusCheckResultTag.objects.order_by('value')
    self.assertEqual(len(tags), 0)
def run_metrics_check(check):
    """
    Run the status check.

    Side effect: ``check.importance`` is overwritten when the series is empty
    (for the warn/fail handlers) or when a threshold is breached.

    :param check: the status check
    :return: a ``(result, tags)`` tuple - a StatusCheckResult containing
             success/failure/error information, and the list of tags gathered
             while evaluating the series
    """
    # Get the series data. If there was an error, return immediately.
    series = check.get_series()

    # If there was an error fetching metrics, fail
    if series['error'] is True:
        message = series.get('error_message')
        # logger.error, not logger.exception: no exception is being handled
        # here, so there is no traceback to attach.
        logger.error('Error fetching metrics: {}: {}'.format(
            series.get('error_code'), message))
        error = 'Error fetching metric from source: {}'.format(message)
        return (StatusCheckResult(status_check=check, succeeded=False, error=error),
                [check.tag_fetch_error])

    # If the series is empty, apply the empty-series handler
    if series['data'] == []:
        if check.on_empty_series == defs.ON_EMPTY_SERIES_PASS:
            return StatusCheckResult(status_check=check, succeeded=True,
                                     error='SUCCESS: no data'), []
        if check.on_empty_series == defs.ON_EMPTY_SERIES_WARN:
            check.importance = Service.WARNING_STATUS
            tags = [check.tag_no_data]
            return StatusCheckResult(status_check=check, succeeded=False,
                                     error='WARNING: no data'), tags
        if check.on_empty_series == defs.ON_EMPTY_SERIES_FAIL:
            check.importance = check.high_alert_importance
            tags = [check.tag_no_data]
            return StatusCheckResult(
                status_check=check,
                succeeded=False,
                error='{}: no data'.format(check.importance)), tags

    # Ignore all checks before the following start time
    start_time = time.time() - check.time_range * 60

    def filter_old_points(p):
        timestamp = p[0]
        if timestamp <= start_time:
            logger.debug('Ignoring point {} older than {}'.format(
                str(p), str(start_time)))
            return False
        return True

    parsed_series = series['data']
    logger.info('Processing series {}'.format(str(parsed_series)))

    # order is important - most severe first, since we report the first error found
    thresholds = [
        (check.high_alert_importance, check.high_alert_value),
        (Service.WARNING_STATUS, check.warning_value),
    ]

    # Process each series, updating result and tags as we go
    result = StatusCheckResult(status_check=check, succeeded=True)
    result.raw_data = _get_raw_data_with_thresholds(check, series)
    tags = []

    # loop order is:
    # (high_importance, series_1), (high_importance, series_2), ...,
    # (warning, series_1), (warning, series_2), ...
    # and we report the first error encountered as our error
    # (but continue looping so we accumulate tags)
    for importance, threshold in thresholds:
        for series_data in parsed_series:
            series_name = series_data['series']
            datapoints = list(
                filter(filter_old_points, series_data['datapoints']))

            failing_point = _point_triggering_alert(
                datapoints, check.check_type, check.consecutive_failures,
                threshold)
            if failing_point is not None:
                tags.append(check.tag_failing(importance, series_name))
                if result.succeeded:
                    # record the first, most severe failure
                    result.succeeded = False
                    check.importance = importance
                    result.error = _get_error_message(
                        check, threshold, importance, series_name,
                        failing_point[1])

            logger.info('Finished processing series {}'.format(series_name))

    return result, tags
def _run(self):
    """
    Evaluate the parsed metric against self.check_type / self.value.

    For every series that has values, the series minimum (for ``<``/``<=``),
    maximum (for ``>``/``>=``) or the value list (for ``==``) is compared with
    self.value. The check fails when more than self.allowed_num_failures series
    breach the threshold, or when fewer than self.expected_num_hosts series
    returned data.

    :return: a StatusCheckResult describing the outcome
    :raises Exception: if self.check_type is not one of the supported operators
    """
    if not hasattr(self, 'utcnow'):
        self.utcnow = None
    result = StatusCheckResult(status_check=self)

    # NOTE: Can be added later
    # last_result = self.last_result()
    #
    # if last_result:
    #     last_result_started = last_result.time
    #     time_to_check = max(self.frequency, ((timezone.now() - last_result_started).total_seconds() / 60) + 1)
    # else:
    #     time_to_check = self.frequency

    output = self.parse_metric()
    result.raw_data = output["raw"]

    if output["error"]:
        result.error = output["error"]
        result.succeeded = False
        return result

    if not output["num_series_with_data"]:
        result.error = "Empty result for given metric"
        result.succeeded = False
        return result

    failures = []

    if output['num_series_with_data'] > 0:
        result.average_value = output['average_value']

        for s in output['series']:
            if not s["values"]:
                continue
            failure_value = None
            if self.check_type == '<':
                if float(s["min"]) < float(self.value):
                    failure_value = s["min"]
            elif self.check_type == '<=':
                if float(s["min"]) <= float(self.value):
                    failure_value = s["min"]
            elif self.check_type == '>':
                if float(s["max"]) > float(self.value):
                    failure_value = s["max"]
            elif self.check_type == '>=':
                if float(s["max"]) >= float(self.value):
                    failure_value = s["max"]
            elif self.check_type == '==':
                if float(self.value) in s['values']:
                    failure_value = float(self.value)
            else:
                raise Exception(u'Check type %s not supported' %
                                self.check_type)

            # Compare against None explicitly: a failing value of 0 is falsy
            # but is still a genuine failure that must be counted.
            if failure_value is not None:
                failures.append(failure_value)

    if len(failures) > self.allowed_num_failures:
        result.succeeded = False
    elif output['num_series_with_data'] < self.expected_num_hosts:
        result.succeeded = False
    else:
        result.succeeded = True

    if not result.succeeded:
        result.error = self.format_error_message(
            failures,
            output['num_series_with_data']
        )

    return result
class LocalTestCase(TestCase):
    """
    Base test case that replaces the outbound HTTP / Twilio / mail entry points
    with Mocks and builds a small set of checks, a service and check results.
    """

    def setUp(self):
        """Stub the outbound integrations, then create the fixtures."""
        for owner, attr in (
            (requests, 'get'),
            (requests, 'post'),
            (rest, 'TwilioRestClient'),
            (mail, 'send_mail'),
        ):
            self._stub_attribute(owner, attr)
        self.create_dummy_data()
        super(LocalTestCase, self).setUp()

    def _stub_attribute(self, owner, attr):
        """
        Replace ``owner.attr`` with a fresh Mock for the duration of one test.

        The previous value is restored through addCleanup so the patch does not
        leak into tests that run later in the same process.
        """
        try:
            original = getattr(owner, attr)
        except AttributeError:
            # Nothing to restore: just remove the stub again afterwards.
            self.addCleanup(delattr, owner, attr)
        else:
            self.addCleanup(setattr, owner, attr, original)
        setattr(owner, attr, Mock())

    def create_dummy_data(self):
        """
        Create a user, one Graphite, one Jenkins and one HTTP check attached to a
        single service, and two results for the Graphite check.
        """
        self.username = '******'
        self.password = '******'
        self.user = User.objects.create(username=self.username)
        self.user.set_password(self.password)
        self.user.save()

        self.graphite_check = GraphiteStatusCheck.objects.create(
            name='Graphite Check',
            metric='stats.fake.value',
            check_type='>',
            value='9.0',
            created_by=self.user,
            importance=Service.ERROR_STATUS,
        )
        self.jenkins_check = JenkinsStatusCheck.objects.create(
            name='Jenkins Check',
            created_by=self.user,
            importance=Service.ERROR_STATUS,
            max_queued_build_time=10,
        )
        self.http_check = HttpStatusCheck.objects.create(
            name='Http Check',
            created_by=self.user,
            importance=Service.CRITICAL_STATUS,
            endpoint='http://arachnys.com',
            timeout=10,
            status_code='200',
            text_match=None,
        )
        self.service = Service.objects.create(
            name='Service',
        )
        self.service.status_checks.add(
            self.graphite_check, self.jenkins_check, self.http_check)

        # Passing is most recent
        self.most_recent_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=1),
            time_complete=timezone.now(),
            succeeded=True
        )
        self.most_recent_result.save()

        # failing is second most recent
        self.older_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=60),
            time_complete=timezone.now() - timedelta(seconds=59),
            succeeded=False
        )
        self.older_result.save()

        self.graphite_check.save()  # Will recalculate status
def create_dummy_data(self):
    """
    Create fixtures with fixed primary keys: a user with "add" permissions, two
    Jenkins checks, an HTTP check and a TCP check, two schedules, one service
    (linked to the principal schedule and three of the checks) and two results
    for the HTTP check (older failure, newer pass).
    """
    self.username = '******'
    self.password = '******'

    self.user = User.objects.create(username=self.username)
    self.user.set_password(self.password)
    granted = [
        Permission.objects.get(codename=codename)
        for codename in (
            'add_service',
            'add_httpstatuscheck',
            'add_jenkinsstatuscheck',
            'add_tcpstatuscheck',
        )
    ]
    self.user.user_permissions.add(*granted)
    self.user.save()

    self.jenkins_check = JenkinsStatusCheck.objects.create(
        id=10101,
        name='Jenkins Check',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        max_queued_build_time=10,
        max_build_failures=5,
    )
    self.jenkins_check2 = JenkinsStatusCheck.objects.create(
        id=10104,
        name='Jenkins Check 2',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        max_queued_build_time=10,
        max_build_failures=0,
    )
    self.http_check = HttpStatusCheck.objects.create(
        id=10102,
        name='Http Check',
        created_by=self.user,
        importance=Service.CRITICAL_STATUS,
        endpoint='http://arachnys.com',
        timeout=10,
        status_code='200',
        text_match=None,
    )
    self.tcp_check = TCPStatusCheck.objects.create(
        id=10103,
        name='TCP Check',
        created_by=self.user,
        importance=Service.ERROR_STATUS,
        address='github.com',
        port=80,
        timeout=6,
    )

    # Set ical_url for schedule to filename we're using for mock response
    self.schedule = Schedule.objects.create(
        name='Principal',
        ical_url='calendar_response.ics',
    )
    self.secondary_schedule = Schedule.objects.create(
        name='Secondary',
        ical_url='calendar_response_different.ics',
        fallback_officer=self.user,
    )
    self.schedule.save()
    self.secondary_schedule.save()

    self.service = Service.objects.create(
        id=2194,
        name='Service',
    )
    self.service.save()
    self.service.schedules.add(self.schedule)
    # Note: jenkins_check2 is created above but not attached to the service.
    self.service.status_checks.add(
        self.jenkins_check, self.http_check, self.tcp_check)

    def record_result(started_ago, completed_ago, succeeded):
        # Offsets are in seconds before "now".
        recorded = StatusCheckResult(
            status_check=self.http_check,
            time=timezone.now() - timedelta(seconds=started_ago),
            time_complete=timezone.now() - timedelta(seconds=completed_ago),
            succeeded=succeeded,
        )
        recorded.save()
        return recorded

    # Failing is second most recent
    self.older_result = record_result(60, 59, False)
    # Passing is most recent
    self.most_recent_result = record_result(1, 0, True)

    self.http_check.save()  # Will recalculate status
class LocalTestCase(APITestCase):
    """
    Base API test case that replaces the outbound HTTP / Twilio / mail entry
    points with Mocks and builds a user with "add" permissions, three checks
    attached to one service, and two results for the Graphite check.
    """

    def setUp(self):
        """Stub the outbound integrations, then create the fixtures."""
        for owner, attr in (
            (requests, "get"),
            (requests, "post"),
            (rest, "TwilioRestClient"),
            (mail, "send_mail"),
        ):
            self._stub_attribute(owner, attr)
        self.create_dummy_data()
        super(LocalTestCase, self).setUp()

    def _stub_attribute(self, owner, attr):
        """
        Replace ``owner.attr`` with a fresh Mock for the duration of one test.

        The previous value is restored through addCleanup so the patch does not
        leak into tests that run later in the same process.
        """
        try:
            original = getattr(owner, attr)
        except AttributeError:
            # Nothing to restore: just remove the stub again afterwards.
            self.addCleanup(delattr, owner, attr)
        else:
            self.addCleanup(setattr, owner, attr, original)
        setattr(owner, attr, Mock())

    def create_dummy_data(self):
        """Create the user, checks, service and check results used by the tests."""
        self.username = "******"
        self.password = "******"
        self.user = User.objects.create(username=self.username)
        self.user.set_password(self.password)
        self.user.user_permissions.add(
            Permission.objects.get(codename="add_instance"),
            Permission.objects.get(codename="add_service"),
            Permission.objects.get(codename="add_httpstatuscheck"),
            Permission.objects.get(codename="add_graphitestatuscheck"),
            Permission.objects.get(codename="add_jenkinsstatuscheck"),
            Permission.objects.get(codename="add_icmpstatuscheck"),
        )
        self.user.save()

        self.graphite_check = GraphiteStatusCheck.objects.create(
            name="Graphite Check",
            metric="stats.fake.value",
            check_type=">",
            value="9.0",
            created_by=self.user,
            importance=Service.ERROR_STATUS,
        )
        self.jenkins_check = JenkinsStatusCheck.objects.create(
            name="Jenkins Check",
            created_by=self.user,
            importance=Service.ERROR_STATUS,
            max_queued_build_time=10
        )
        self.http_check = HttpStatusCheck.objects.create(
            name="Http Check",
            created_by=self.user,
            importance=Service.CRITICAL_STATUS,
            endpoint="http://arachnys.com",
            timeout=10,
            status_code="200",
            text_match=None,
        )
        self.service = Service.objects.create(name="Service")
        self.service.status_checks.add(
            self.graphite_check, self.jenkins_check, self.http_check)

        # failing is second most recent
        self.older_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=60),
            time_complete=timezone.now() - timedelta(seconds=59),
            succeeded=False,
        )
        self.older_result.save()

        # Passing is most recent
        self.most_recent_result = StatusCheckResult(
            check=self.graphite_check,
            time=timezone.now() - timedelta(seconds=1),
            time_complete=timezone.now(),
            succeeded=True,
        )
        self.most_recent_result.save()

        self.graphite_check.save()  # Will recalculate status
def _run(self):
    """
    Fetch the last self.frequency minutes of the configured CloudWatch metric
    (60-second periods) and fail if any datapoint's self.statistic breaches
    self.value under self.check_type.

    self.cloudwatch_metric is split on ":" into namespace and metric name. No
    datapoints at all is reported as a failure. The error text lists the
    breaching values followed by the operator and the threshold.

    :return: a StatusCheckResult describing the outcome
    :raises Exception: if self.check_type is not one of the supported operators
    """
    outcome = StatusCheckResult(status_check=self)

    try:
        client = get_boto_client(self.cloudwatch_config)
    except Exception as exc:
        outcome.succeeded = False
        outcome.error = u"Couldn't create cloudwatch client: {}".format(exc)
        return outcome

    namespace, metric_name = self.cloudwatch_metric.split(":")
    window_start = datetime.now() - timedelta(minutes=self.frequency)
    window_end = datetime.now()
    response = client.get_metric_statistics(
        Namespace=namespace,
        MetricName=metric_name,
        Dimensions=self.parsed_dimensions(),
        StartTime=window_start,
        EndTime=window_end,
        Period=60,
        Statistics=['SampleCount', 'Average', 'Sum', 'Minimum', 'Maximum'],
    )

    if len(response['Datapoints']) == 0:
        outcome.succeeded = False
        outcome.error = u"No datapoints"
        return outcome

    # Each entry answers "does this sample breach the limit?"
    breach_tests = {
        '<': lambda sample, limit: sample < limit,
        '<=': lambda sample, limit: sample <= limit,
        '>': lambda sample, limit: sample > limit,
        '>=': lambda sample, limit: sample >= limit,
        '==': lambda sample, limit: limit == sample,
    }

    samples = [point[self.statistic] for point in response['Datapoints']]
    failures = []
    for sample in samples:
        breached = breach_tests.get(self.check_type)
        if breached is None:
            raise Exception(u'Check type %s not supported' % self.check_type)

        limit = float(self.value)
        failure_value = None
        if breached(sample, limit):
            # An equality breach records the threshold; the others record the sample.
            failure_value = limit if self.check_type == '==' else sample
        if failure_value is not None:
            failures.append(failure_value)

    if len(failures) > 0:
        outcome.succeeded = False
        outcome.error = u"{} {} {}".format(failures, self.check_type, self.value)
        return outcome

    outcome.succeeded = True
    return outcome
def _run(self):
    """
    Look the monitor up via self.findMonitor() and translate the HTTP response
    into a StatusCheckResult.

    * 401 / 404: failure, with the status text stored in ``raw_data``.
    * 200: failure when the monitor details report ``isDown``, success otherwise.
    * any other status code: recorded as a success with the error text
      'Unexpected response!'.
    * any exception: failure, with the exception's ``args`` stored as both the
      error and the raw data.
    """
    outcome = StatusCheckResult(status_check=self)
    try:
        self.checkIfMonitorIdExists()
        response = self.findMonitor()

        # NOTE(review): 401 reuses the "not found / deleted" wording of 404 -
        # confirm whether an authorisation failure deserves its own message.
        for code, raw_text in ((401, '401 UNAUTHORIZED'), (404, '404 NOT FOUND')):
            if response.status_code == code:
                outcome.error = u"Cant find monitor process {} with id: {}. Probably it was deleted.".format(self.monitor_name,self.monitor_id)
                outcome.succeeded = False
                outcome.raw_data = raw_text
                return outcome

        if response.status_code == 200:
            details = response.json().get('monitorDetails')
            if details.get('isDown'):
                outcome.error = u"Monitor process {} is down! Please checkin using URL: {}".format(self.monitor_name,self.monitor_checkin)
                outcome.succeeded = False
                outcome.raw_data = self.buildRawData(details)
            else:
                outcome.succeeded = True
                outcome.error = 'None'
                outcome.raw_data = 'Monitor is alive!'
            return outcome

        # NOTE(review): an unexpected status code is reported as a *success* -
        # verify this is deliberate rather than an inverted flag.
        outcome.succeeded = True
        outcome.error = 'Unexpected response!'
        outcome.raw_data = u'Response code is: {}'.format(response.status_code)
        return outcome
    except Exception as exc:
        # NOTE(review): `args` is a tuple, not text - confirm the result fields
        # are meant to receive it as-is.
        outcome.error = exc.args
        outcome.succeeded = False
        outcome.raw_data = exc.args
        return outcome