def process_file(self, filename):
    if self.is_excluded(filename):
        return
    if filename == '-':
        self.iostream = sys.stdin
        self.process_json(sys.stdin.read(), '<STDIN>')
    else:
        # reset this flag which we use to only print single quote detection once per file
        self.single_quotes_detected = False
        try:
            with open(filename) as self.iostream:
                # check if it's a Big Data format file with json doc on first line
                # this is more efficient than slurping a large file only to fail with out of memory
                for _ in range(1, 10):
                    line = self.iostream.readline()
                    if line:
                        if isJson(line) or \
                           isJson(self.convert_single_quoted(line)) or \
                           isJson(self.convert_single_quoted_escaped(line)):
                            log.debug("header line of '{0}' detected as a valid JSON document".format(filename) +
                                      ", assuming Big Data format multi-line json")
                            self.process_multirecord_json(filename)
                            break
                else:
                    try:
                        self.iostream.seek(0)
                        content = self.iostream.read()
                        self.process_json(content, filename)
                    except MemoryError:
                        # may be a big data format after all and perhaps the first record was broken
                        log.warning("memory error validating contents from file '{0}', ".format(filename) +
                                    "assuming Big Data multi-record json and re-trying validation line-by-line")
                        self.process_multirecord_json(filename)
        except IOError as _:
            die("ERROR: %s" % _)
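# The header probe above leans on an isJson() helper from the project's
# utility library. A minimal sketch of the idea, assuming it simply tests
# whether a string parses as a JSON document (the real helper may be
# stricter, e.g. rejecting bare scalars):
import json

def is_json(arg):
    """Return True if arg parses as JSON, False otherwise."""
    try:
        json.loads(arg)
        return True
    except ValueError:  # json.JSONDecodeError subclasses ValueError
        return False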
def parse_builds(self, content):
    log.debug('parsing build info')
    build = None
    collected_builds = []
    json_data = json.loads(content)
    if not json_data or \
       'builds' not in json_data or \
       not json_data['builds']:
        qquit('UNKNOWN', "no Travis CI builds returned by the Travis API." +
                         " Either the specified repo '{0}' doesn't exist".format(self.repo) +
                         " or no builds have happened yet?" +
                         " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this" +
                         " blank build set whereas 'HariSekhon/Nagios-Plugins' succeeds" +
                         " in returning latest builds information")
    builds = json_data['builds']
    # get latest finished failed build
    last_build_number = None
    found_newer_passing_build = False
    for _ in builds:
        # API returns most recent build first
        # extra check to make sure we're getting the very latest build number and API hasn't changed
        build_number = _['number']
        if not isInt(build_number):
            raise UnknownError('build number returned is not an integer!')
        build_number = int(build_number)
        if last_build_number is None:
            last_build_number = int(build_number) + 1
        if build_number >= last_build_number:
            raise UnknownError('build number returned is out of sequence, cannot be >= last build returned. ' +
                               '{0}'.format(support_msg_api()))
        last_build_number = build_number
        if self.completed:
            if len(collected_builds) < self.num and _['state'] in ('passed', 'finished', 'failed', 'errored'):
                collected_builds.append(_)
        elif self.failed:
            if _['state'] == 'passed':
                if not collected_builds and not found_newer_passing_build:
                    log.warning("found more recent successful build #%s with state = '%s'" +
                                ", you may not need to debug this build any more", _['number'], _['state'])
                    found_newer_passing_build = True
            elif _['state'] in ('failed', 'errored'):
                if len(collected_builds) < self.num:
                    collected_builds.append(_)
                # by continuing to iterate through the rest of the builds we can check
                # their last_build numbers are descending for extra sanity checking
                #break
        elif len(collected_builds) < self.num:
            collected_builds.append(_)
            # by continuing to iterate through the rest of the builds we can check
            # their last_build numbers are descending for extra sanity checking
            #break
    if not collected_builds:
        qquit('UNKNOWN', 'no recent builds found')
    if log.isEnabledFor(logging.DEBUG):
        for build in collected_builds:
            log.debug("build:\n%s", jsonpp(build))
    return collected_builds
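# The out-of-sequence guard above amounts to asserting strictly descending
# build numbers (the API returns most recent first). A standalone sketch of
# that invariant, with hypothetical sample data rather than real API output:
def assert_descending_build_numbers(builds):
    last = None
    for build in builds:
        number = int(build['number'])
        if last is not None and number >= last:
            raise ValueError('build number {0} out of sequence (>= {1})'.format(number, last))
        last = number

assert_descending_build_numbers([{'number': '7'}, {'number': '6'}, {'number': '4'}])  # passes: strictly descending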
def __parse_args__(self):
    try:
        (self.options, self.args) = self.__parser.parse_args()
    # I don't agree with zero exit code from OptionParser for help/usage,
    # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
    except SystemExit:  # pragma: no cover
        sys.exit(ERRORS['UNKNOWN'])
    if self.options.help:  # pragma: no cover
        self.usage()
    if self.options.version:  # pragma: no cover
        print('%(version)s' % self.__dict__)
        sys.exit(ERRORS['UNKNOWN'])
    if 'timeout' in dir(self.options):
        self.timeout = self.get_opt('timeout')
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        # os.getenv() returns a string, compare and store as int
        if int(env_verbose) > self.verbose:
            log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
            self.verbose = int(env_verbose)
    elif env_verbose is None:
        pass
    else:
        log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
    self.parse_args()
    return self.options, self.args
def validate_csvreader(csvreader, filename):
    count = 0
    try:
        # csvreader doesn't seem to generate any errors ever :-(
        # csv module allows entire lines of json/xml/yaml to go in as a single field
        # Adding some invalidations manually
        for field_list in csvreader:
            # list of fields with no separator information
            log.debug("line: %s", field_list)
            # make it fail if there is only a single field on any line
            if len(field_list) < 3:
                log.error("less than 3 fields detected, aborting conversion of file '%s'", filename)
                return None
            # extra protection along the same lines as anti-json:
            # the first char of field should be alphanumeric, not syntax
            # however instead of isAlnum allow quotes for quoted CSVs to pass validation
            if field_list[0] != "" and not isChars(field_list[0][0], 'A-Za-z0-9"'):
                log.error('non-alphanumeric / quote opening character detected in CSV')
                return None
            count += 1
    except csv.Error as _:
        log.warning('file %s, line %s: %s', filename, csvreader.line_num, _)
        return None
    if count == 0:
        log.error('zero lines detected, blank input is not valid CSV')
        return None
    return csvreader
def load_file(filename, boundary=False):
    log.info('loading custom regex patterns from %s', filename)
    regex_list = []
    re_ending_pipe = re.compile(r'\|\s*$')
    re_leading_space = re.compile(r'^\s*')
    with open(filename) as filehandle:
        for line in filehandle:
            line = line.rstrip('\n')
            line = line.rstrip('\r')
            line = line.split('#')[0]
            line = re_ending_pipe.sub('', line)
            line = re_leading_space.sub('', line)
            if not line:
                continue
            if not isRegex(line):
                log.warning('ignoring invalid regex from %s: %s', os.path.basename(filename), line)
                continue
            if boundary:
                line = r'(\b|[^A-Za-z])' + line + r'(\b|[^A-Za-z])'
            regex_list.append(line)
    raw = '|'.join(regex_list)
    #log.debug('custom_raw: %s', raw)
    regex_list = [re.compile(_, re.I) for _ in regex_list]
    return (regex_list, raw)
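# Hypothetical usage of load_file(); 'custom_regex.txt' is an assumed
# filename containing one regex per line, with '#' starting a comment,
# as the parsing above implies:
regex_list, raw = load_file('custom_regex.txt', boundary=True)
for regex in regex_list:
    if regex.search('some log line to scrub'):
        print('matched: %s' % regex.pattern)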
def __parse_timeout__(self):
    # reset this to None otherwise unit tests fail to take setting from timeout_default
    # use __timeout to bypass the property setter checks
    self.__timeout = None
    if 'timeout' in dir(self.options):
        timeout = self.get_opt('timeout')
        if timeout is not None:
            log.debug('getting --timeout value %s', timeout)
            self.timeout = timeout
    if self.timeout is None:
        env_timeout = os.getenv('TIMEOUT')
        log.debug('getting $TIMEOUT value %s', env_timeout)
        if env_timeout is not None:
            log.debug('env_timeout is not None')
            if isInt(env_timeout):
                log.debug("environment variable $TIMEOUT = '%s' and timeout not already set, setting timeout = %s",
                          env_timeout, env_timeout)
                self.timeout = int(env_timeout)
            else:
                log.warning("$TIMEOUT environment variable is not an integer ('%s')", env_timeout)
    if self.timeout is None:
        log.debug('timeout not set, using default timeout %s', self.timeout_default)
        self.timeout = self.timeout_default
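# The effective precedence above is: explicit --timeout option, then the
# $TIMEOUT environment variable (if it is an integer), then the class
# default. A condensed standalone sketch of that resolution order; the
# function and argument names are illustrative, not the class's real API:
import os

def resolve_timeout(option_value, default):
    if option_value is not None:
        return option_value
    env_timeout = os.getenv('TIMEOUT')
    if env_timeout is not None:
        try:
            return int(env_timeout)
        except ValueError:
            pass  # a non-integer $TIMEOUT is ignored, falling through to the default
    return default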
def process_json(self, content, filename):
    log.debug('process_json()')
    if not content:
        log.warning("blank content passed to process_json for contents of file '%s'", filename)
    if isJson(content):
        print(json.dumps(json.loads(content)))
        return True
    elif self.permit_single_quotes:
        log.debug('checking for single quoted JSON')
        # check if it's regular single quoted JSON a la MongoDB
        json_single_quoted = self.convert_single_quoted(content)
        if self.process_json_single_quoted(json_single_quoted, filename):
            return True
        log.debug('single quoted JSON check failed, trying with pre-escaping double quotes')
        # check if it's single quoted JSON with double quotes that aren't escaped,
        # by pre-escaping them before converting single quotes to doubles for processing
        json_single_quoted_escaped = self.convert_single_quoted_escaped(content)
        if self.process_json_single_quoted(json_single_quoted_escaped, filename):
            log.debug("processed single quoted json with non-escaped double quotes in '%s'", filename)
            return True
        log.debug('single quoted JSON check failed even with pre-escaping any double quotes')
    self.failed = True
    log.error("invalid json detected in '%s':", filename)
    printerr(content)
    if not self.continue_on_error:
        sys.exit(ERRORS['CRITICAL'])
    return False
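# convert_single_quoted() is assumed to swap the quoting style so that
# MongoDB-style {'key': 'value'} documents can be fed to the JSON parser;
# a naive sketch -- the real methods are assumed to handle embedded quotes
# more carefully, as the pre-escaping fallback above suggests:
def convert_single_quoted(content):
    return content.replace("'", '"')

def convert_single_quoted_escaped(content):
    # pre-escape existing double quotes before swapping single for double
    return convert_single_quoted(content.replace('"', r'\"'))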
def __parse_verbose__(self):
    self.verbose += int(self.get_opt('verbose'))
    env_verbose = os.getenv('VERBOSE')
    if isInt(env_verbose):
        # os.getenv() returns a string, compare as int
        if int(env_verbose) > self.verbose:
            log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
            self.verbose = int(env_verbose)
    elif env_verbose is None:
        pass
    else:
        log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
    if self.is_option_defined('quiet') and self.get_opt('quiet'):
        self.verbose = 0
    elif self.verbose > 2:
        log.setLevel(logging.DEBUG)
    elif self.verbose > 1:
        log.setLevel(logging.INFO)
    elif self.verbose > 0 and self._prog[0:6] != 'check_':
        log.setLevel(logging.WARN)
    if self.options.debug:
        log.setLevel(logging.DEBUG)  # pragma: no cover
        log.debug('enabling debug logging')
        if self.verbose < 3:
            self.verbose = 3
def process_result(self, result):
    _id = result['id']
    log.info('latest build id: %s', _id)
    status = result['status']
    log.info('status: %s', status)
    if not isInt(status, allow_negative=True):
        raise UnknownError('non-integer status returned by DockerHub API. {0}'.format(support_msg_api()))
    tag = result['dockertag_name']
    log.info('tag: %s', tag)
    trigger = result['cause']
    log.info('trigger: %s', trigger)
    created_date = result['created_date']
    log.info('created date: %s', created_date)
    last_updated = result['last_updated']
    log.info('last updated: %s', last_updated)
    created_datetime = datetime.datetime.strptime(created_date.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    updated_datetime = datetime.datetime.strptime(last_updated.split('.')[0], '%Y-%m-%dT%H:%M:%S')
    build_latency_timedelta = updated_datetime - created_datetime
    build_latency = build_latency_timedelta.total_seconds()
    log.info('build latency (creation to last updated): %s', build_latency)
    # results in .0 floats anyway
    build_latency = int(build_latency)
    build_code = result['build_code']
    build_url = 'https://hub.docker.com/r/{0}/builds/{1}'.format(self.repo, build_code)
    log.info('latest build URL: %s', build_url)
    if str(status) in self.statuses:
        status = self.statuses[str(status)]
    else:
        log.warning("status code '%s' not recognized! %s", status, support_msg_api())
        log.warning('defaulting to assume status is an Error')
        status = 'Error'
    if status != 'Success':
        self.critical()
    self.msg += "'{repo}' last completed build status: '{status}', tag: '{tag}', build code: {build_code}"\
                .format(repo=self.repo, status=status, tag=tag, build_code=build_code)
    if self.verbose:
        self.msg += ', id: {0}'.format(_id)
        self.msg += ', trigger: {0}'.format(trigger)
        self.msg += ', created date: {0}'.format(created_date)
        self.msg += ', last updated: {0}'.format(last_updated)
        self.msg += ', build_latency: {0}'.format(sec2human(build_latency))
        self.msg += ', build URL: {0}'.format(build_url)
    self.msg += ' | build_latency={0:d}s'.format(build_latency)
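# Hypothetical shape of the DockerHub build result dict consumed above,
# inferred from the keys accessed in the code -- not a verbatim API response:
sample_result = {
    'id': 12345678,
    'status': 10,
    'dockertag_name': 'latest',
    'cause': 'VCS_CHANGE',
    'created_date': '2018-01-01T10:00:00.000000Z',
    'last_updated': '2018-01-01T10:05:30.000000Z',
    'build_code': 'abc123xyz',
}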
def parse_table(self, row):
    #log.debug(row)
    user = row[self.indicies['user_index']]  # 'hari.sekhon' in '*****@*****.**' in kerberos
    if self.re_ignored_users and self.re_ignored_users.match(user):
        log.debug('skipping row for ignored user %s: %s', user, row)
        return (None, None)
    database = row[self.indicies['database_index']].strip()
    table = row[self.indicies['table_index']].strip()
    if not database or not table or not self.re_table.match('{}.{}'.format(database, table)):
        #log.info('table not found in fields for row: %s', row)
        operation = row[self.indicies['operation_index']]
        if operation in self.operations_to_ignore:
            return (None, None)
        elif operation == 'QUERY':
            query = row[self.indicies['sql_index']]
            # cheaper than re_ignore to pre-filter
            if query in ('GET_TABLES', 'GET_SCHEMAS', 'INVALIDATE METADATA'):
                return (None, None)
            (database, table) = self.get_db_table_from_resource(row)
            if not (database and table):
                log.debug('database/table not found in row: %s', row)
                log.debug('trying to parse: %s', query)
                match = self.re_select_from_table.search(query)
                if match:
                    table = match.group(1)
                    if '.' in table:
                        (database, table) = table.split('.', 1)
                # could use .search but all these seem to be at beginning
                elif self.re_ignore.match(query):
                    return (None, None)
                else:
                    log.warning('failed to parse database/table from query: %s', query)
                    return (None, None)
        else:
            log.debug('database/table not found in row and operation is not a query to parse: %s', row)
            return (None, None)
    if not table and not database:
        return (None, None)
    table = table.lower().strip('`')
    database = database.lower().strip('`')
    if ' ' in table:
        raise CriticalError("table '{}' has spaces - parsing error for row: {}".format(table, row))
    if ' ' in database:
        raise CriticalError("database '{}' has spaces - parsing error for row: {}".format(database, row))
    if table == 'null':
        raise CriticalError('table == null - parsing error for row: {}'.format(row))
    return (database, table)
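# re_select_from_table is not shown in this excerpt; a hypothetical sketch
# of such a FROM-clause extraction regex, purely for illustration:
import re

re_select_from_table = re.compile(r'\bFROM\s+([A-Za-z0-9_.`]+)', re.I)

match = re_select_from_table.search('SELECT count(*) FROM default.web_logs')
if match:
    print(match.group(1))  # default.web_logs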
def process_csv(self, filehandle):
    csvreader = None
    try:
        if self.delimiter is not None:
            try:
                csvreader = csv.reader(filehandle, delimiter=self.delimiter, quotechar=self.quotechar)
            except TypeError as _:
                self.usage(_)
        else:
            # dialect = csv.excel
            dialect = csv.Sniffer().sniff(filehandle.read(1024))  # this will raise an Error if invalid
            dialect.strict = True
            filehandle.seek(0)
            csvreader = csv.reader(filehandle, dialect)
    except csv.Error as _:
        log.warning('file %s: %s', self.filename, _)
        return False
    count = 0
    try:
        # csvreader doesn't seem to generate any errors ever :-(
        # csv module allows entire lines of json/xml/yaml to go in as a single field
        # Adding some invalidations manually
        for field_list in csvreader:
            # list of fields with no separator information
            #log.debug("line: %s", field_list)
            # make it fail if there is only a single field on any line
            if len(field_list) < 2:
                return False
            # it's letting JSON through :-/
            if field_list[0] == '{':
                return False
            # extra protection along the same lines as anti-json:
            # the first char of field should be alphanumeric, not syntax
            # however instead of isAlnum allow quotes for quoted CSVs to pass validation
            # (guard against an empty first field to avoid an IndexError)
            if field_list[0] != '' and not isChars(field_list[0][0], 'A-Za-z0-9\'"'):
                return False
            count += 1
    except csv.Error as _:
        log.warning('file %s, line %s: %s', self.filename, csvreader.line_num, _)
        return False
    if count == 0:
        log.debug('zero lines detected, blank input is not valid CSV')
        return False
    log.debug('%s CSV lines passed', count)
    return True
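# The first-character check relies on an isChars()-style helper; a minimal
# sketch, assuming it tests that every character of the argument falls
# within the given regex character-class range:
import re

def is_chars(arg, chars):
    return bool(re.match('^[' + chars + ']+$', arg))

is_chars('a', 'A-Za-z0-9\'"')  # True
is_chars('{', 'A-Za-z0-9\'"')  # False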
def parse_json(self, json_data):
    drillbits = json_data['drillbits']
    online_nodes = 0
    for drillbit in drillbits:
        if 'state' not in drillbit:
            raise UnknownError('state field not found, is this Apache Drill < 1.12?')
        if drillbit['state'] == 'ONLINE':
            online_nodes += 1
        else:
            log.warning("node '%s' state = '%s'", drillbit['address'], drillbit['state'])
    total_nodes = len(drillbits)
    offline_nodes = total_nodes - online_nodes
    self.msg = 'Apache Drill cluster: drillbits offline = {}'.format(offline_nodes)
    self.check_thresholds(offline_nodes)
    self.msg += ', drillbits online = {}'.format(online_nodes)
    self.msg += ', total drillbits = {}'.format(total_nodes)
    self.msg += ' | drillbits_offline={}{} drillbits_online={} drillbits_total={}'\
                .format(offline_nodes, self.get_perf_thresholds(), online_nodes, total_nodes)
def output(self, row, database, table):
    if not self.re_table.match('{}.{}'.format(database, table)):
        log.warning('%s.%s does not match table regex', database, table)
        return
    # instead of collecting in ram, now just post-process through sort -u
    # this way it is easier to see live extractions, --debug and correlate
    #self.data[database] = self.data.get(database, {})
    #self.data[database][table] = 1
    if table and not database:
        log.info('got table but not database for row: %s', row)
    if database and not table:
        log.info('got database but not table for row: %s', row)
    if not table and not database:
        return
    #self.csv_writer.writerow({'database': database, 'table': table, 'user': row[self.indicies['user_index']]})
    self.csv_writer.writerow({'database': database, 'table': table})
    if log.isEnabledFor(logging.DEBUG):
        sys.stdout.flush()
def parse_json(self, json_data):
    drillbits = json_data['drillbits']
    online_nodes = 0
    for drillbit in drillbits:
        if 'state' in drillbit:
            if drillbit['state'] == 'ONLINE':
                online_nodes += 1
            else:
                log.warning("node '%s' state = '%s'", drillbit['address'], drillbit['state'])
        else:
            online_nodes += 1
    self.msg = 'Apache Drill cluster: drillbits online = {}'.format(online_nodes)
    self.check_thresholds(online_nodes)
    total_nodes = len(drillbits)
    offline_nodes = total_nodes - online_nodes
    self.msg += ', drillbits offline = {}'.format(offline_nodes)
    self.msg += ', total drillbits = {}'.format(total_nodes)
    self.msg += ' | drillbits_online={}{} drillbits_offline={} drillbits_total={}'\
                .format(online_nodes, self.get_perf_thresholds(), offline_nodes, total_nodes)
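# For reference, a hypothetical shape of the cluster JSON both parse_json()
# variants consume, inferred from the fields accessed above -- not a
# verbatim Apache Drill API response:
sample_json_data = {
    'drillbits': [
        {'address': 'drill-node-1', 'state': 'ONLINE'},
        {'address': 'drill-node-2', 'state': 'ONLINE'},
        {'address': 'drill-node-3', 'state': 'OFFLINE'},
    ]
}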
def check_multirecord_json(self):
    log.debug('check_multirecord_json()')
    normal_json = False
    single_quoted = False
    count = 0
    for line in self.iostream:
        if isJson(line):
            normal_json = True
            # can't use self.print() here, don't want to print valid for every line of a file / stdin
            if self.passthru:
                print(line, end='')
            count += 1
            continue
        elif self.permit_single_quotes and self.check_json_line_single_quoted(line):
            single_quoted = True
            if self.passthru:
                print(line, end='')
            count += 1
            continue
        else:
            log.debug('invalid multirecord json')
            self.failed = True
            if not self.passthru:
                die(self.invalid_json_msg)
            return False
    if count == 0:
        log.debug('blank input, detected zero lines while multirecord checking')
        self.failed = True
        return False
    # self.multi_record_detected = True
    log.debug('multirecord json (all %s lines passed)', count)
    extra_info = ''
    if single_quoted:
        extra_info = ' single quoted'
        if normal_json:
            extra_info += ' mixed with normal json!'
            log.warning('mixture of normal and single quoted json detected, ' +
                        'may cause issues for data processing engines')
    if not self.passthru:
        print('{0} (multi-record format{1}, {2} records)'.format(self.valid_json_msg, extra_info, count))
    return True
def get_csvreader(filename):
    try:
        filehandle = open(filename)
    except IOError as _:
        log.error(_)
        return None
    filename = os.path.basename(filename)
    try:
        dialect = csv.Sniffer().sniff(filehandle.read(1024))  # this will raise an Error if invalid
        dialect.strict = True
        filehandle.seek(0)
        csvreader = csv.reader(filehandle, dialect)
    except csv.Error as _:
        log.warning('file %s: %s', filename, _)
        return None
    csvreader = CrunchAccountingCsvStatementConverter.validate_csvreader(csvreader, filename)
    filehandle.seek(0)
    return csvreader
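# Hypothetical usage of get_csvreader(); 'statement.csv' is an assumed filename:
csvreader = get_csvreader('statement.csv')
if csvreader is None:
    print('file is not a valid CSV statement')
else:
    for row in csvreader:
        print(row)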