def run(self, never_match_re=None):
    """
    Collect and report on all cloud-custodian Lambda errors.

    Finds every Lambda function whose name starts with ``custodian-`` or
    ``cloud-custodian-``, loads the failed invocations via
    ``_get_sqs_dlq()``, checks each function in turn (sleeping
    ``INTER_FUNC_SLEEP`` seconds after each check) and finally calls
    ``_ack_sqs()``.

    :param never_match_re: passed through unchanged to
      ``_check_function`` for every function checked
    :raises SystemExit: with code 1 if ``_check_function`` returned a
      false value for at least one function
    """
    print('Searching cloud-custodian Lambda functions for failed invocations')
    name_pattern = re.compile(r'^(custodian-|cloud-custodian-).*')
    func_names = LambdaHealthChecker.find_matching_func_names(
        name_pattern, self._region_name
    )
    logger.debug('Custodian Lambda functions: %s', func_names)
    had_failures = False
    self._get_sqs_dlq()
    logger.debug(
        '%d failed Lambda invocations: %s',
        len(self._failed_request_ids),
        self._failed_request_ids.keys()
    )
    for name in func_names:
        healthy = self._check_function(name, never_match_re=never_match_re)
        if not healthy:
            logger.info(
                '_check_function returned False (NOT HEALTHY) for: %s', name
            )
            had_failures = True
        # Pause after every function, healthy or not.
        pause = self.INTER_FUNC_SLEEP
        logger.debug(
            'Sleeping %s seconds before checking next function', pause
        )
        sleep(pause)
    self._ack_sqs()
    # Request IDs still mapped to None were never matched to a function.
    unmatched = [
        rid for rid, owner in self._failed_request_ids.items()
        if owner is None
    ]
    if unmatched:
        warning = red(
            'ERROR: %d failed Lambda RequestIDs could not be tied '
            'to their function names: %s' % (len(unmatched), unmatched)
        )
        print("\n\n" + warning + "\n\n")
    if not had_failures:
        print('No Lambda functions had errors in the last ' + self.INVL_DESC)
        return
    print('Some lambda functions had errors in the last %s' % self.INVL_DESC)
    raise SystemExit(1)
def _check_function(self, func_name, never_match_re=None):
    """
    Check health of one Lambda function. Print information on it to STDOUT.
    Return True for healthy, False if errors/failures.

    The function is reported as unhealthy when more than 50% of its
    invocations errored, more than 50% of its invocations were throttled,
    or ``get_filtered_logs`` returned any log events for it.

    As a side effect, every RequestID present in the returned logs is
    recorded in ``self._failed_request_ids`` as belonging to ``func_name``.

    :param func_name: Lambda function name to check
    :type func_name: str
    :param never_match_re: Regex for logs to NEVER return, even if they
      match ``always_match_re``. Only passed on for functions whose name
      matches ``ALL_ERROR_FUNCTIONS``.
    :type never_match_re: ``re``
    :return: True if the function is healthy, False if it had
      errors/failures
    :rtype: bool
    """

    def _print_event(event):
        # Print one log event's message, indented, skipping blank lines.
        print("\n".join([
            "\t\t%s" % line.replace("\t", ' ')
            for line in event['message'].split("\n")
            if line.strip() != ''
        ]))

    c = LambdaHealthChecker(
        func_name, self._region_name, logs=self._logs, cw=self._cw
    )
    # Only look for RequestIDs not yet attributed to a function.
    req_ids = [
        i for i in self._failed_request_ids
        if self._failed_request_ids[i] is None
    ]
    if self.ALL_ERROR_FUNCTIONS.match(func_name):
        logs = c.get_filtered_logs(
            req_ids,
            always_match_re=self.ALL_ERROR_LOG_RE,
            never_match_re=never_match_re
        )
    else:
        logs = c.get_filtered_logs(req_ids)
    metrics = c.get_cloudwatch_metric_sums()
    invocations = metrics['Invocations']
    if invocations > 0:
        throttle_pct = (metrics['Throttles'] / invocations) * 100
        error_pct = (metrics['Errors'] / invocations) * 100
    else:
        # No invocations: avoid dividing by zero.
        throttle_pct = 0
        error_pct = 0
    msg = []
    if error_pct > 50:
        msg.append(
            'Lambda Function Errors: %s%% (%d of %d invocations)' % (
                error_pct, metrics['Errors'], invocations
            )
        )
    if throttle_pct > 50:
        msg.append(
            'Lambda Function Throttles: %s%% (%d of %d invocations)' % (
                throttle_pct, metrics['Throttles'], invocations
            )
        )
    has_logs = len(logs) > 0
    if not has_logs and not msg:
        print(green('%s: OK\n' % func_name))
        return True
    print(red('%s: ERRORS' % func_name))
    for m in msg:
        print("\t%s" % red(m))
    if not has_logs:
        # A metric threshold was exceeded but there are no logs to show.
        # The function is still unhealthy, so report failure.
        return False
    print("\n\tLogs For Failed Invocations:\n")
    for req_id, events in logs.items():
        if req_id == 'always_match':
            # Not a RequestID; printed separately below.
            continue
        # Tie this failed RequestID to the function it belongs to.
        self._failed_request_ids[req_id] = func_name
        print("\t" + red(
            'RequestID=%s logGroupName=%s logStreamName=%s' % (
                req_id, events[0]['logGroupName'],
                events[0]['logStreamName']
            )
        ))
        for e in events:
            _print_event(e)
    if 'always_match' in logs:
        print(
            "\t" + red(
                'Always-Match Logs (RequestID not in DLQ, but log matches '
                'regex that we want to always alarm on)'
            )
        )
        for e in logs['always_match']:
            _print_event(e)
    print('')
    return False
def test_red(self):
    """``red('foo')`` yields ``foo`` between ``\\033[0;31m`` and ``\\033[0m``."""
    expected = "\033[0;31mfoo\033[0m"
    actual = red('foo')
    assert actual == expected