def _run(self):
    """Ping ``self.host`` and turn the outcome into a StatusCheckResult.

    The system ``ping`` binary is invoked with the configured packet size,
    packet count and timeout.  Its combined stdout/stderr is stored in
    ``result.raw_data`` and handed to ``self._parse_output``, whose return
    value is read as a mapping with ``'packet_loss'`` and ``'rtt'['avg']``.

    The check fails when:
      * ``ping`` exits non-zero (its captured output becomes the error),
      * any packet loss is reported,
      * ``self.max_rtt`` is set and the average RTT exceeds it,
      * anything else goes wrong (e.g. the binary is missing).

    :return: the populated StatusCheckResult (never raises for the cases
        above).
    """
    result = StatusCheckResult(status_check=self)
    # An argument list with shell=False keeps the host out of any shell
    # interpretation.
    args = [
        'ping',
        '-s', str(self.packet_size),
        '-c', str(self.count),
        '-W', str(self.timeout),
        self.host,
    ]
    try:
        result.raw_data = subprocess.check_output(
            args, stderr=subprocess.STDOUT, shell=False)
        r = self._parse_output(result.raw_data)
        if r['packet_loss'] > 0.0:
            raise Exception("%0.1f%% packet loss" % r['packet_loss'])
        elif self.max_rtt and r['rtt']['avg'] > self.max_rtt:
            raise Exception(
                "Maximum average RTT reached: %s" % r['rtt']['avg'])
    except subprocess.CalledProcessError as e:
        result.succeeded = False
        result.error = e.output
    except Exception as e:
        # Format the exception itself rather than ``e.message``: the latter
        # is empty for multi-argument exceptions such as OSError and does
        # not exist at all on Python 3.
        result.error = u"{}, Host: {}".format(e, self.host)
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """Check that a TCP connection to ``self.host:self.port`` can be opened.

    The connection is opened with ``self.timeout``, shut down in both
    directions and closed.  Any exception along the way marks the check as
    failed and is reported in ``result.error``.

    :return: the populated StatusCheckResult.
    """
    result = StatusCheckResult(status_check=self)
    try:
        s = socket.create_connection((self.host, self.port), self.timeout)
        try:
            s.shutdown(socket.SHUT_RDWR)
        finally:
            # Always release the descriptor, even if shutdown() fails.
            s.close()
    except Exception as e:
        # socket errors carry (errno, text); ``e.message`` would be empty
        # for them, so format the exception itself.
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """Connect over TCP, optionally send a message and verify the reply.

    Steps:
      1. Open a connection to ``self.host:self.port`` using ``self.timeout``.
      2. If ``self.message_to_send`` is set, send it (base64-decoded first
         when ``self.message_to_send_b64`` is true).
      3. If ``self.expected_reply`` is set, read exactly as many bytes as the
         expected reply is long and compare.  Only the beginning of the
         server's answer is inspected on purpose: for HTTP, for example, it
         is enough to match ``HTTP/1.1 200 OK`` and ignore the rest.

    The configured message and expected reply are never modified on the
    instance, so running the check repeatedly is safe.

    :return: the populated StatusCheckResult.  Connection, send and receive
        errors are reported through ``result.error`` with
        ``result.succeeded = False``.
    """
    import base64

    def _as_bytes(value, is_b64):
        # Produce the raw bytes to put on / expect from the wire.
        if is_b64:
            return base64.b64decode(value)
        if isinstance(value, bytes):
            return value
        return value.encode('utf-8')

    result = StatusCheckResult(status_check=self)
    try:
        s = socket.create_connection((self.host, self.port), self.timeout)
    except Exception as e:
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
        # Nothing to clean up: the socket was never created.
        return result

    try:
        # The connection itself succeeded; the criteria below can only
        # downgrade the result.
        result.succeeded = True

        if self.message_to_send:
            payload = _as_bytes(self.message_to_send,
                                self.message_to_send_b64)
            # sendall() keeps writing until the whole payload is out.
            s.sendall(payload)

        if self.expected_reply:
            expected = _as_bytes(self.expected_reply,
                                 self.expected_reply_b64)
            # recv() may return fewer bytes than requested, so keep reading
            # until we have enough or the peer closes the connection.
            received_response = b''
            while len(received_response) < len(expected):
                chunk = s.recv(len(expected) - len(received_response))
                if not chunk:
                    break
                received_response += chunk

            if received_response != expected:
                result.error = u'Got unexpected response %r' % (
                    received_response, )
                result.succeeded = False
    except Exception as e:
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    finally:
        try:
            s.shutdown(socket.SHUT_RDWR)
        except socket.error:
            # The peer may already have dropped the connection; closing is
            # all that is left to do.
            pass
        s.close()
    return result
def _run(self):
    """Check how long the certificate returned for this host stays valid.

    ``self.ssl_expiry_datetime()`` is compared against the current UTC time.
    The check fails when the certificate has already expired, or when it
    expires in fewer than ``self.days`` days.  Any exception raised while
    obtaining the expiry date also fails the check.

    :return: the populated StatusCheckResult; on failure ``result.error``
        contains the reason followed by the host and port.
    """
    result = StatusCheckResult(status_check=self)
    try:
        remaining = self.ssl_expiry_datetime() - datetime.datetime.utcnow()
        if remaining < datetime.timedelta(days=0):
            # ``remaining`` is negative here; negate it so the message shows
            # a positive number of days since expiry.
            raise Exception(
                "Certificate expired %s days ago" % (-remaining).days)
        elif remaining < datetime.timedelta(days=self.days):
            raise Exception(
                "Certificate expires in %s days" % remaining.days)
    except Exception as e:
        # Format the exception itself: ``e.message`` is empty for
        # multi-argument exceptions (socket errors) and absent on Python 3.
        result.error = u"{} {} {}".format(e, self.host, self.port)
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """Check that a PostgreSQL connection can be established.

    Connects with ``psycopg2`` using the configured database name, user,
    password, host and port, then closes the connection immediately.

    :return: the populated StatusCheckResult; any exception raised while
        connecting or closing is reported in ``result.error``.
    """
    result = StatusCheckResult(status_check=self)
    try:
        conn = psycopg2.connect(
            dbname=self.dbname,
            user=self.dbuser,
            password=self.dbpassword,
            host=self.host,
            port=self.port,
        )
        conn.close()
    except Exception as e:
        # Format the exception itself; ``e.message`` is deprecated and does
        # not exist on Python 3 exceptions.
        result.error = u'Error occurred: %s' % (e, )
        result.succeeded = False
    else:
        result.succeeded = True
    return result
def _run(self):
    """Run ``self._check()`` and wrap its outcome in a StatusCheckResult.

    On success the value returned by ``_check`` is stored as raw data.  A
    ``StatusGoException`` contributes both its ``raw_data`` and ``message``;
    any other exception is reported by its string form.
    """
    outcome = StatusCheckResult(status_check=self)

    try:
        payload = self._check()
    except StatusGoException as exc:
        # This exception type carries the raw data gathered before failing.
        outcome.raw_data = exc.raw_data
        outcome.error = u'Error occurred: {}'.format(exc.message)
        outcome.succeeded = False
        return outcome
    except Exception as exc:
        outcome.error = u'Error occurred: {}'.format(exc)
        outcome.succeeded = False
        return outcome

    outcome.raw_data = payload
    outcome.succeeded = True
    return outcome
def _run(self):
    """Run an SMTP conversation and compare the last response code.

    The session connects to ``self.host:self.port``, sends EHLO with
    ``self.helo_address`` and, when configured, ``MAIL FROM`` (sender) and
    ``RCPT TO`` (sender and recipient both set).

    The check succeeds when the last response code recorded by the session
    equals ``self.expected_code``.  This also holds when the session raised
    an ``Exception``: the comparison is still made against whatever codes
    were recorded, while ``result.error`` keeps the exception text.

    The full conversation recorded by the session is stored in
    ``result.raw_data``.

    :return: the populated StatusCheckResult.
    """
    result = StatusCheckResult(status_check=self)
    sess = SmtpSession()

    def _last_code_matches():
        # True only if at least one response was seen and the most recent
        # one is the expected code.
        codes = sess.response_codes
        return len(codes) > 0 and self.expected_code == codes[-1]

    try:
        sess.connect(self.host, self.port)
        sess.ehlo(self.helo_address)
        if self.sender:
            sess.call('MAIL FROM:', self.sender)
            if self.recipient:
                sess.call('RCPT TO:', self.recipient)
    except Exception as e:
        # Format the exception itself: ``e.message`` is empty for
        # multi-argument exceptions and absent on Python 3.
        result.error = u'Error occurred %s: %s' % (
            e.__class__.__name__,
            e,
        )
        result.succeeded = _last_code_matches()
    except:  # noqa: E722
        # NOTE(review): kept from the original; this also swallows
        # KeyboardInterrupt/SystemExit - confirm that is intended.
        result.error = u'Error occurred: %s' % (sys.exc_info()[0], )
        result.succeeded = False
    else:
        result.succeeded = _last_code_matches()
        if not result.succeeded:
            # Explain the failure instead of leaving the error empty.
            codes = sess.response_codes
            last_code = codes[-1] if len(codes) > 0 else None
            result.error = u'Expected response code %s, got %s' % (
                self.expected_code,
                last_code,
            )
    finally:
        sess.quit()

    result.raw_data = "\n".join(sess.conversation)
    return result
def _run(self):
    """Query the monitor service and translate its answer into a result.

    After ``self.checkIfMonitorIdExists()``, ``self.findMonitor()`` is
    called and its HTTP status code is interpreted:

      * 401 - failure, the request was not authorized.
      * 404 - failure, the monitor could not be found.
      * 200 - the ``monitorDetails`` object of the JSON body is inspected;
        a truthy ``isDown`` fails the check, otherwise it succeeds.
      * anything else - reported as 'Unexpected response!' but still marked
        as succeeded.

    Any exception fails the check; its text becomes ``result.error`` and its
    ``args`` are kept as raw data.

    :return: the populated StatusCheckResult.
    """
    result = StatusCheckResult(status_check=self)
    try:
        self.checkIfMonitorIdExists()
        monitorResponse = self.findMonitor()
        status_code = monitorResponse.status_code

        if status_code == 401:
            # Distinguish an authorization problem from a missing monitor.
            result.error = u"Not authorized to query monitor process {} with id: {}.".format(
                self.monitor_name, self.monitor_id)
            result.succeeded = False
            result.raw_data = '401 UNAUTHORIZED'
        elif status_code == 404:
            result.error = u"Cant find monitor process {} with id: {}. Probably it was deleted.".format(
                self.monitor_name, self.monitor_id)
            result.succeeded = False
            result.raw_data = '404 NOT FOUND'
        elif status_code == 200:
            monitorData = monitorResponse.json().get('monitorDetails')
            if monitorData.get('isDown'):
                result.error = u"Monitor process {} is down! Please checkin using URL: {}".format(
                    self.monitor_name, self.monitor_checkin)
                result.succeeded = False
                result.raw_data = self.buildRawData(monitorData)
            else:
                result.succeeded = True
                result.error = 'None'
                result.raw_data = 'Monitor is alive!'
        else:
            # NOTE(review): an unexpected status code is still reported as a
            # success (kept from the original) - confirm this is intended.
            result.succeeded = True
            result.error = 'Unexpected response!'
            result.raw_data = u'Response code is: {}'.format(status_code)
    except Exception as e:
        # Store readable text, not the raw ``args`` tuple, as the error.
        result.error = u'{}'.format(e)
        result.succeeded = False
        result.raw_data = e.args
    return result
def _run(self):
    """Evaluate a CloudWatch metric against the configured threshold.

    ``self.cloudwatch_metric`` is split on ``:`` into namespace and metric
    name.  Statistics for the last ``self.frequency`` minutes are fetched
    with a 60 second period, and the ``self.statistic`` value of every
    datapoint is compared to ``float(self.value)`` using
    ``self.check_type`` (one of ``<``, ``<=``, ``>``, ``>=``, ``==``).
    Every datapoint for which the comparison holds counts as a failure.

    :return: the populated StatusCheckResult.  The check fails when the
        client cannot be created, when no datapoints are returned, or when
        at least one datapoint matches the failure condition.
    :raises Exception: if ``self.check_type`` is not supported.  Errors from
        the metric query itself are not caught here.
    """
    import operator

    result = StatusCheckResult(status_check=self)
    try:
        client = get_boto_client(self.cloudwatch_config)
    except Exception as e:
        result.succeeded = False
        result.error = u"Couldn't create cloudwatch client: {}".format(e)
        return result

    namespace, metric_name = self.cloudwatch_metric.split(":")
    # Naive datetimes are interpreted as UTC when the request is serialized,
    # so the window must be built from UTC time; a single clock read keeps
    # the window exactly ``frequency`` minutes wide.
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(minutes=self.frequency)
    resp = client.get_metric_statistics(
        Namespace=namespace,
        MetricName=metric_name,
        Dimensions=self.parsed_dimensions(),
        StartTime=start_time,
        EndTime=end_time,
        Period=60,
        Statistics=['SampleCount', 'Average', 'Sum', 'Minimum', 'Maximum'],
    )

    datapoints = resp['Datapoints']
    if len(datapoints) == 0:
        result.succeeded = False
        result.error = u"No datapoints"
        return result

    stats = [dp[self.statistic] for dp in datapoints]

    comparators = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '==': operator.eq,
    }
    if self.check_type not in comparators:
        raise Exception(u'Check type %s not supported' % self.check_type)
    matches_failure = comparators[self.check_type]
    threshold = float(self.value)

    failures = []
    for stat in stats:
        if matches_failure(stat, threshold):
            # For equality the threshold itself is reported, otherwise the
            # offending datapoint value.
            failures.append(threshold if self.check_type == '==' else stat)

    if len(failures) > 0:
        result.succeeded = False
        result.error = u"{} {} {}".format(
            failures, self.check_type, self.value)
        return result

    result.succeeded = True
    return result
def run_metrics_check(check):
    """
    Run the status check.

    The series returned by ``check.get_series()`` are compared against the
    check's high-alert and warning thresholds.  Only points newer than
    ``check.time_range`` minutes are considered.  As a side effect,
    ``check.importance`` is updated to the importance of the reported
    failure (or of the empty-series handling).

    :param check: the status check
    :return: a ``(StatusCheckResult, tags)`` tuple: the result carries
        success/failure/error information, ``tags`` is a list of tags
        describing what went wrong (empty when nothing did)
    """
    # Get the series data. If there was an error, return immediately.
    series = check.get_series()

    # If there was an error fetching metrics, fail
    if series['error'] is True:
        message = series.get('error_message')
        # No exception is being handled here, so log a plain error rather
        # than logger.exception (which would append an empty traceback).
        logger.error('Error fetching metrics: {}: {}'.format(
            series.get('error_code'), message))
        error = 'Error fetching metric from source: {}'.format(message)
        return StatusCheckResult(status_check=check, succeeded=False,
                                 error=error), [check.tag_fetch_error]

    # If the series is empty, apply the empty-series handler
    if series['data'] == []:
        if check.on_empty_series == defs.ON_EMPTY_SERIES_PASS:
            return StatusCheckResult(status_check=check, succeeded=True,
                                     error='SUCCESS: no data'), []
        if check.on_empty_series == defs.ON_EMPTY_SERIES_WARN:
            check.importance = Service.WARNING_STATUS
            tags = [check.tag_no_data]
            return StatusCheckResult(status_check=check, succeeded=False,
                                     error='WARNING: no data'), tags
        if check.on_empty_series == defs.ON_EMPTY_SERIES_FAIL:
            check.importance = check.high_alert_importance
            tags = [check.tag_no_data]
            return StatusCheckResult(
                status_check=check,
                succeeded=False,
                error='{}: no data'.format(check.importance)), tags

    # Ignore all points at or before the following start time
    start_time = time.time() - check.time_range * 60

    def filter_old_points(p):
        # Each point is indexed as (timestamp, value).
        timestamp = p[0]
        if timestamp <= start_time:
            logger.debug('Ignoring point {} older than {}'.format(
                str(p), str(start_time)))
            return False
        return True

    parsed_series = series['data']
    logger.info('Processing series {}'.format(str(parsed_series)))

    # order is important - most severe first, since we report the first
    # error found
    thresholds = [
        (check.high_alert_importance, check.high_alert_value),
        (Service.WARNING_STATUS, check.warning_value),
    ]

    # Process each series, updating result and tags as we go
    result = StatusCheckResult(status_check=check, succeeded=True)
    result.raw_data = _get_raw_data_with_thresholds(check, series)
    tags = []

    # loop order is:
    #   (high_importance, series_1), (high_importance, series_2), ...,
    #   (warning, series_1), (warning, series_2), ...
    # and we report the first error encountered as our error
    # (but continue looping so we accumulate tags)
    for importance, threshold in thresholds:
        for series_data in parsed_series:
            series_name = series_data['series']
            datapoints = [p for p in series_data['datapoints']
                          if filter_old_points(p)]

            failing_point = _point_triggering_alert(
                datapoints, check.check_type, check.consecutive_failures,
                threshold)
            if failing_point is not None:
                tags.append(check.tag_failing(importance, series_name))
                if result.succeeded:
                    # record the first, most severe failure
                    result.succeeded = False
                    check.importance = importance
                    result.error = _get_error_message(
                        check, threshold, importance, series_name,
                        failing_point[1])

            logger.info(
                'Finished processing series {}'.format(series_name))

    return result, tags
def _run(self):
    """Evaluate the parsed metric against the configured threshold.

    ``self.parse_metric()`` supplies the raw data, an optional error, the
    number of series with data, their average value and the per-series
    ``min``/``max``/``values``.  For each series that has values, the
    comparison selected by ``self.check_type`` is applied:

      * ``<`` / ``<=`` compare the series minimum with ``self.value``,
      * ``>`` / ``>=`` compare the series maximum with ``self.value``,
      * ``==`` tests whether ``float(self.value)`` occurs in the values.

    The check fails when the metric could not be parsed, when no series has
    data, when more than ``self.allowed_num_failures`` series match the
    failure condition, or when fewer than ``self.expected_num_hosts``
    series have data.

    :return: the populated StatusCheckResult.
    :raises Exception: if ``self.check_type`` is not supported.
    """
    if not hasattr(self, 'utcnow'):
        self.utcnow = None
    result = StatusCheckResult(status_check=self)

    output = self.parse_metric()
    result.raw_data = output["raw"]

    if output["error"]:
        result.error = output["error"]
        result.succeeded = False
        return result
    if not output["num_series_with_data"]:
        result.error = "Empty result for given metric"
        result.succeeded = False
        return result

    failures = []
    if output['num_series_with_data'] > 0:
        result.average_value = output['average_value']
        for s in output['series']:
            if not s["values"]:
                continue
            failure_value = None
            if self.check_type == '<':
                if float(s["min"]) < float(self.value):
                    failure_value = s["min"]
            elif self.check_type == '<=':
                if float(s["min"]) <= float(self.value):
                    failure_value = s["min"]
            elif self.check_type == '>':
                if float(s["max"]) > float(self.value):
                    failure_value = s["max"]
            elif self.check_type == '>=':
                if float(s["max"]) >= float(self.value):
                    failure_value = s["max"]
            elif self.check_type == '==':
                if float(self.value) in s['values']:
                    failure_value = float(self.value)
            else:
                raise Exception(
                    u'Check type %s not supported' % self.check_type)

            # Compare against None explicitly: a failing value of 0 is
            # falsy but must still be counted as a failure.
            if failure_value is not None:
                failures.append(failure_value)

    if len(failures) > self.allowed_num_failures:
        result.succeeded = False
    elif output['num_series_with_data'] < self.expected_num_hosts:
        result.succeeded = False
    else:
        result.succeeded = True

    if not result.succeeded:
        result.error = self.format_error_message(
            failures,
            output['num_series_with_data']
        )
    return result