def process_build_info(self, build_info):
    """Turn one Jenkins build record into the check message and status.

    Reads ``displayName``, ``duration``, ``result``, ``timestamp`` and
    ``building`` from ``build_info``. A build that is still running sets
    UNKNOWN and returns early. Any result other than ``SUCCESS`` sets
    CRITICAL. The duration (divided by 1000 and reported as secs) is passed
    to ``check_thresholds`` and the build age is compared to ``self.age``.

    Raises:
        UnknownError: if ``duration`` or ``timestamp`` is not an integer.
        KeyError: if an expected field is missing from ``build_info``.
    """
    build_name = build_info['displayName']

    raw_duration = build_info['duration']
    if not isInt(raw_duration):
        raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
    duration_secs = int(raw_duration) / 1000

    result = build_info['result']

    raw_timestamp = build_info['timestamp']
    if not isInt(raw_timestamp):
        raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
    timestamp_ms = int(raw_timestamp)

    still_building = build_info['building']

    self.msg += "build {build} status: ".format(build=build_name)
    if still_building:
        # no final result to report yet
        self.unknown()
        self.msg += 'STILL BUILDING!'
        return

    self.msg += result
    if result != 'SUCCESS':
        self.critical()

    self.msg += ', duration={duration} secs'.format(duration=duration_secs)
    self.check_thresholds(duration_secs)

    # timestamp is divided by 1000 to compare against time.time() seconds
    age = time.time() - (timestamp_ms / 1000)
    self.msg += ', age={age} secs'.format(age=sec2human(age))
    if age < 0:
        # a build dated in the future
        self.warning()
        self.msg += ' (< 0!)'
    if self.age and age > self.age:
        self.critical()
        self.msg += ' (> {0:d})'.format(self.age)

    self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(
        duration=duration_secs, perf_thresholds=self.get_perf_thresholds())
 def process_build_info(self, build_info):
     """Turn one Jenkins build record into the check message and status.

     Reads ``displayName``, ``duration``, ``result``, ``timestamp`` and
     ``building`` from ``build_info``. A build that is still running sets
     UNKNOWN and returns early. Any result other than ``SUCCESS`` sets
     CRITICAL. The duration (divided by 1000 and reported as secs) is passed
     to ``check_thresholds`` and the build age is compared to ``self.age``.

     Raises:
         UnknownError: if ``duration`` or ``timestamp`` is not an integer.
         KeyError: if an expected field is missing from ``build_info``.
     """
     build_name = build_info['displayName']

     raw_duration = build_info['duration']
     if not isInt(raw_duration):
         raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
     duration_secs = int(raw_duration) / 1000

     result = build_info['result']

     raw_timestamp = build_info['timestamp']
     if not isInt(raw_timestamp):
         raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
     timestamp_ms = int(raw_timestamp)

     still_building = build_info['building']

     self.msg += "build {build} status: ".format(build=build_name)
     if still_building:
         # no final result to report yet
         self.unknown()
         self.msg += 'STILL BUILDING!'
         return

     self.msg += result
     if result != 'SUCCESS':
         self.critical()

     self.msg += ', duration={duration} secs'.format(duration=duration_secs)
     self.check_thresholds(duration_secs)

     # timestamp is divided by 1000 to compare against time.time() seconds
     age = time.time() - (timestamp_ms / 1000)
     self.msg += ', age={age} secs'.format(age=sec2human(age))
     if age < 0:
         # a build dated in the future
         self.warning()
         self.msg += ' (< 0!)'
     if self.age and age > self.age:
         self.critical()
         self.msg += ' (> {0:d})'.format(self.age)

     self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(
         duration=duration_secs, perf_thresholds=self.get_perf_thresholds())
Exemple #3
0
 def check_app(self, app):
     """Evaluate one Yarn application record and build the status message.

     Sets CRITICAL when the state is not ``RUNNING``, when the user or queue
     differs from the expected ``self.app_user`` / ``self.queue`` (if set),
     or when running containers fall below ``self.min_containers`` (if set).
     The elapsed time is passed to ``check_thresholds``.

     Args:
         app: mapping with ``name``, ``state``, ``user``, ``queue``,
             ``runningContainers`` and ``elapsedTime`` keys (plus
             ``finalStatus`` when the state is ``FINISHED``).

     Returns:
         The elapsed time in whole seconds (``elapsedTime`` divided by 1000).
     """
     state = app['state']
     user = app['user']
     queue = app['queue']
     running_containers = app['runningContainers']
     elapsed_time = app['elapsedTime']
     assert isInt(running_containers, allow_negative=True)
     assert isInt(elapsed_time)
     running_containers = int(running_containers)
     # convert before dividing so an integer-like string does not raise TypeError;
     # floor division equals the previous int(x / 1000) for non-negative values
     elapsed_time = int(elapsed_time) // 1000
     self.msg = "Yarn application '{0}' state = '{1}'".format(app['name'], state)
     if state != 'RUNNING':
         self.critical()
     ##################
     # This shouldn't be used any more now using more targeted query to only return running apps
     # state = FAILED / KILLED also gets final status = FAILED KILLED, no point double printing
     if state == 'FINISHED':
         self.msg += ", final status = '{0}'".format(app['finalStatus'])
     ##################
     # the placeholder was missing here, so the user was never shown in the message
     self.msg += ", user = '{0}'".format(user)
     if self.app_user is not None and self.app_user != user:
         self.critical()
         self.msg += " (expected '{0}')".format(self.app_user)
     self.msg += ", queue = '{0}'".format(queue)
     if self.queue is not None and self.queue != queue:
         self.critical()
         self.msg += " (expected '{0}')".format(self.queue)
     self.msg += ", running containers = {0}".format(running_containers)
     if self.min_containers is not None and running_containers < self.min_containers:
         self.critical()
         self.msg += " (< '{0}')".format(self.min_containers)
     self.msg += ", elapsed time = {0} secs".format(elapsed_time)
     self.check_thresholds(elapsed_time)
     return elapsed_time
Exemple #4
0
 def parse_results(self, content):
     """Parse the Attivio AIE health JSON and set the status and message.

     CRITICAL if any nodes are down or any fatals are reported, WARNING if
     there are warnings or performance monitoring is down. Appends a summary
     and perfdata to ``self.msg``. Any KeyError / ValueError while parsing
     ends the check through ``qquit('UNKNOWN', ...)``.
     """
     try:
         payload = json.loads(content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(content))
             print('=' * 80)
         if not isDict(payload):
             raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"
                              .format(type(payload)))
         # 'haserrors' is deliberately not checked: it is also true for
         # warnings and would mask the finer-grained checks below
         raw_nodes_down = payload['nodesdown']
         raw_warnings = payload['warnings']
         raw_fatals = payload['fatals']
         raw_acknowledged = payload['acknowledged']
         # all four keys are read before any is validated, so a missing key
         # is reported ahead of a malformed value
         validations = (
             (raw_nodes_down, 'non-integer returned for nodes down count by Attivio AIE'),
             (raw_warnings, 'non-integer returned for warnings count by Attivio AIE'),
             (raw_fatals, 'non-integer returned for fatals count by Attivio AIE'),
             (raw_acknowledged, 'non-integer returned for acknowledged count by Attivio AIE'),
         )
         for raw_value, error_text in validations:
             if not isInt(raw_value):
                 raise ValueError(error_text)
         nodes_down = int(raw_nodes_down)
         warnings = int(raw_warnings)
         fatals = int(raw_fatals)
         acknowledged = int(raw_acknowledged)
         counts = {
             'nodes_down': nodes_down,
             'fatals': fatals,
             'warnings': warnings,
             'acknowledged': acknowledged,
         }
         if nodes_down > 0 or fatals > 0:
             self.critical()
         elif warnings > 0:
             self.warning()
         summary = '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'
         self.msg += summary.format(**counts)
         if payload['perfmondown']:
             self.warning()
             self.msg += ', warning: performance monitoring down'
         perfdata = ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'
         self.msg += perfdata.format(**counts)
     except (KeyError, ValueError) as exc:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'
               .format(software=self.software,
                       exception=type(exc).__name__,
                       error=exc,
                       support_msg=support_msg_api()))
Exemple #5
0
 def parse_json(self, json_data):
     """Handle a Jenkins API response for a job listing, a job or a build.

     * With ``self.list_jobs`` set, prints the job names and exits UNKNOWN.
     * If the data has ``lastCompletedBuild``, sets ``self.path`` to that
       build's API path, re-queries and hands the content to ``process_json``.
     * Otherwise treats the data as a build record and reports its result,
       duration and age.

     Raises:
         WarningError: if the job has no completed build yet.
         UnknownError: if ``duration`` or ``timestamp`` is not an integer.
     """
     if self.list_jobs:
         print('Jenkins Jobs:\n')
         for job in json_data['jobs']:
             print(job['name'])
         sys.exit(ERRORS['UNKNOWN'])

     if 'lastCompletedBuild' in json_data:
         last_build = json_data['lastCompletedBuild']
         if not last_build:
             raise WarningError("job '{job}' not built yet".format(job=self.job))
         self.path = '/job/{job}/{number}/api/json'.format(job=self.job, number=last_build['number'])
         response = self.query()
         self.process_json(response.content)
         return

     build_name = json_data['displayName']

     raw_duration = json_data['duration']
     if not isInt(raw_duration):
         raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
     duration_secs = int(raw_duration) / 1000

     result = json_data['result']

     raw_timestamp = json_data['timestamp']
     if not isInt(raw_timestamp):
         raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
     timestamp_ms = int(raw_timestamp)

     still_building = json_data['building']

     self.msg += "build {build} status: ".format(build=build_name)
     if still_building:
         # no final result to report yet
         self.unknown()
         self.msg += 'STILL BUILDING!'
         return

     self.msg += result
     if result != 'SUCCESS':
         self.critical()

     self.msg += ', duration={duration} secs'.format(duration=duration_secs)
     self.check_thresholds(duration_secs)

     # timestamp is divided by 1000 to compare against time.time() seconds
     age = time.time() - (timestamp_ms / 1000)
     self.msg += ', age={age} secs'.format(age=sec2human(age))
     if age < 0:
         self.warning()
         self.msg += ' (< 0!)'
     if self.age and age > self.age:
         self.critical()
         self.msg += ' (> {0:d})'.format(self.age)

     self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(
         duration=duration_secs, perf_thresholds=self.get_perf_thresholds())
 def parse_results(self, content):
     """Parse the Attivio AIE health JSON and set the status and message.

     CRITICAL if any nodes are down or any fatals are reported, WARNING if
     there are warnings or performance monitoring is down. Appends a summary
     and perfdata to ``self.msg``. Any KeyError / ValueError while parsing
     ends the check through ``qquit('UNKNOWN', ...)``.
     """
     try:
         payload = json.loads(content)
         if log.isEnabledFor(logging.DEBUG):
             print(jsonpp(content))
             print('=' * 80)
         if not isDict(payload):
             raise ValueError("non-dict returned by Attivio AIE server response (type was '{0}')"
                              .format(type(payload)))
         # 'haserrors' is deliberately not checked: it is also true for
         # warnings and would mask the finer-grained checks below
         raw_nodes_down = payload['nodesdown']
         raw_warnings = payload['warnings']
         raw_fatals = payload['fatals']
         raw_acknowledged = payload['acknowledged']
         # all four keys are read before any is validated, so a missing key
         # is reported ahead of a malformed value
         validations = (
             (raw_nodes_down, 'non-integer returned for nodes down count by Attivio AIE'),
             (raw_warnings, 'non-integer returned for warnings count by Attivio AIE'),
             (raw_fatals, 'non-integer returned for fatals count by Attivio AIE'),
             (raw_acknowledged, 'non-integer returned for acknowledged count by Attivio AIE'),
         )
         for raw_value, error_text in validations:
             if not isInt(raw_value):
                 raise ValueError(error_text)
         nodes_down = int(raw_nodes_down)
         warnings = int(raw_warnings)
         fatals = int(raw_fatals)
         acknowledged = int(raw_acknowledged)
         counts = {
             'nodes_down': nodes_down,
             'fatals': fatals,
             'warnings': warnings,
             'acknowledged': acknowledged,
         }
         if nodes_down > 0 or fatals > 0:
             self.critical()
         elif warnings > 0:
             self.warning()
         summary = '{nodes_down} nodes down, {fatals} fatals, {warnings} warnings, {acknowledged} acknowledged'
         self.msg += summary.format(**counts)
         if payload['perfmondown']:
             self.warning()
             self.msg += ', warning: performance monitoring down'
         perfdata = ' | nodes_down={nodes_down} fatals={fatals} warnings={warnings} acknowledged={acknowledged}'
         self.msg += perfdata.format(**counts)
     except (KeyError, ValueError) as exc:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'
               .format(software=self.software,
                       exception=type(exc).__name__,
                       error=exc,
                       support_msg=support_msg_api()))
Exemple #7
0
 def mac_getent_passwd_user(self, user):
     """Build a getent-style passwd record for ``user`` from ``dscl`` output.

     Runs ``dscl . -read /Users/<user>`` through ``self.cmd`` and maps the
     RecordName, Password, UniqueID, PrimaryGroupID, RealName,
     NFSHomeDirectory and UserShell fields onto the colon-separated
     ``user:password:uid:gid:name:homedir:shell`` layout.

     Args:
         user: the user name to look up.

     Returns:
         tuple: ``(record, returncode)``; ``record`` is ``''`` when the
         output contained no RecordName.

     Calls ``die()`` if the parsed UID or GID is not numeric.
     """
     log.info('mac_getent_passwd_user(%s)', user)
     # NOTE(review): user is interpolated straight into the command string;
     # confirm self.cmd() does not pass it through a shell unescaped
     command = 'dscl . -read /Users/{user}'.format(user=user)
     (output, returncode) = self.cmd(command)
     user = password = uid = gid = name = homedir = shell = ''
     #log.info('parsing output for passwd conversion')
     output = output.split('\n')
     for (index, line) in enumerate(output):
         tokens = line.split()
         if not tokens:
             continue
         field = tokens[0]
         value = tokens[1] if len(tokens) > 1 else ''
         if field == 'RecordName:':
             user = value
         elif field == 'Password:':
             # the password value is always masked as 'x' in the record
             password = 'x'
         elif field == 'UniqueID:':
             uid = value
         elif field == 'PrimaryGroupID:':
             gid = value
         elif field == 'RealName:':
             name = value
             # the value may be on the following, space-indented line instead
             if not value and len(output) > index + 1 and output[index + 1].startswith(' '):
                 name = output[index + 1].strip()
         # NOTE(review): a former "elif not name and field == 'RecordName:'"
         # fallback sat here but could never run, because the first branch
         # already handles RecordName; it is dropped to keep behaviour as-is
         elif field == 'NFSHomeDirectory:':
             homedir = value
         elif field == 'UserShell:':
             shell = value
     if not user:
         return ('', returncode)
     getent_record = '{user}:{password}:{uid}:{gid}:{name}:{homedir}:{shell}'.format(
         user=user, password=password, uid=uid, gid=gid, name=name, homedir=homedir, shell=shell)
     if not isInt(uid, allow_negative=True):
         die("parsing error: UID '{uid}' is not numeric in record {record}!"
             .format(uid=uid, record=getent_record))
     if not isInt(gid, allow_negative=True):
         die("parsing error: GID '{gid}' is not numeric in record {record}!"
             .format(gid=gid, record=getent_record))
     return (getent_record, returncode)
Exemple #8
0
 def check_file(self, filename):
     """Report files missing from the numbered sequence leading up to ``filename``.

     The basename is matched against ``self.regex``, which must capture three
     groups: prefix, number and suffix. Starting from the captured number,
     ``determine_missing_file_backfill`` is called repeatedly until the
     number reaches 1. Anything collected in ``self.missing_files`` is then
     printed in reverse order and the list is reset.

     Raises:
         UnknownError: if the captured number group is not an integer.
     """
     log.debug('checking file \'%s\'', filename)
     match = self.regex.search(os.path.basename(filename))
     if not match:
         log.debug('no numeric regex match for file, probably not a sequential file' + \
                   ', skipping \'%s\'', filename)
         return
     # will error out here if you've supplied your own regex without capture brackets
     # or if you've got pre-captures - let this bubble to user to fix their regex
     prefix = match.group(1)
     file_number = match.group(2)
     file_suffix = match.group(3)
     if not isInt(file_number):
         raise UnknownError('regex captured non-integer for filename: {}'.format(filename))
     # optional groups that did not participate come back as None; default
     # them before use (os.path.join would raise TypeError on a None prefix)
     if prefix is None:
         prefix = ''
     if file_suffix is None:
         file_suffix = ''
     file_prefix = os.path.join(os.path.dirname(filename), prefix)
     # width of the captured number, passed on to the backfill helper
     padding = len(file_number)
     file_number = int(file_number)
     while file_number > 1:
         file_number = self.determine_missing_file_backfill(file_prefix, file_number, padding, file_suffix)
     if self.missing_files:
         print('\n'.join(reversed(self.missing_files)))
     self.missing_files = []
 def parse_json(self, json_data):
     """Compute the HDFS used-space imbalance across live datanodes.

     Reads the ``LiveNodes`` JSON string from the first bean, finds the
     minimum and maximum ``usedSpace`` over all datanodes and reports
     ``(max - min) / max * 100`` as the imbalance percentage, which is passed
     to ``check_thresholds``. In verbose mode min/max are appended, plus a
     list of nodes when the state is WARNING or CRITICAL.

     Raises:
         UnknownError: if no live datanodes are returned, a ``usedSpace``
             value is not an integer, or an expected key is missing.
     """
     log.info('parsing response')
     try:
         live_nodes = json_data['beans'][0]['LiveNodes']
         live_node_data = json.loads(live_nodes)
         num_datanodes = len(live_node_data)
         if num_datanodes < 1:
             raise UnknownError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                                .format(self.host, self.port))
         min_space = None
         max_space = 0
         for datanode in live_node_data:
             used_space = live_node_data[datanode]['usedSpace']
             if not isInt(used_space):
                 raise UnknownError(
                     'usedSpace is not an integer! {0}'.format(
                         support_msg_api()))
             used_space = int(used_space)
             log.info("datanode '%s' used space = %s", datanode, used_space)
             if min_space is None or used_space < min_space:
                 min_space = used_space
             if used_space > max_space:
                 max_space = used_space
         # guard against division by zero when no datanode has used any space
         # NOTE(review): the log text below says "min" but the divisor is the max
         divisor = max_space
         if divisor < 1:
             log.info(
                 'min used space < 1, resetting divisor to 1 (% will likely be very high)'
             )
             divisor = 1
         assert max_space >= min_space
         largest_imbalance_pc = float('{0:.2f}'.format(
             ((max_space - min_space) / divisor) * 100))
         assert largest_imbalance_pc >= 0
         self.ok()
         self.msg = '{0}% HDFS imbalance on space used'.format(
             largest_imbalance_pc)
         self.check_thresholds(largest_imbalance_pc)
         self.msg += ' across {0:d} datanode{1}'.format(
             num_datanodes, plural(num_datanodes))
         if self.verbose:
             self.msg += ', min used space = {0}, max used space = {1}'.format(
                 min_space, max_space)
         if self.verbose and (self.is_warning() or self.is_critical()):
             self.msg += ' [imbalanced nodes: '
             for datanode in live_node_data:
                 # already validated as integers in the loop above
                 used_space = int(live_node_data[datanode]['usedSpace'])
                 # divide by the guarded divisor, not max_space, to avoid ZeroDivisionError
                 # NOTE(review): this is used space as a % of the max, not the
                 # deviation from it - confirm that is the intended criterion
                 used_pc = used_space / divisor * 100
                 if used_pc > self.thresholds['warning']['upper']:
                     # the '%' belongs outside the braces: '{1:.2f%}' is an
                     # invalid format spec and raises ValueError
                     self.msg += '{0}({1:.2f}%),'.format(datanode, used_pc)
             self.msg = self.msg.rstrip(',') + ']'
         self.msg += " | 'HDFS imbalance on space used %'={0}".format(
             largest_imbalance_pc)
         self.msg += self.get_perf_thresholds()
         self.msg += " num_datanodes={0}".format(num_datanodes)
         self.msg += " min_used_space={0}".format(min_space)
         self.msg += " max_used_space={0}".format(max_space)
     except KeyError as _:
         raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                            .format(self.host, self.port, _, support_msg_api()))
Exemple #10
0
 def __parse_args__(self):
     """Parse command-line options and apply the common help/version/timeout/verbosity handling.

     Exits UNKNOWN on option parsing errors and after printing the version.
     Raises ``self.verbose`` to ``$VERBOSE`` when that variable holds a larger
     integer, then calls ``self.parse_args()``.

     Returns:
         tuple: ``(self.options, self.args)``.
     """
     try:
         (self.options, self.args) = self.__parser.parse_args()
     # I don't agree with zero exit code from OptionParser for help/usage,
     # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
     except SystemExit:  # pragma: no cover
         sys.exit(ERRORS['UNKNOWN'])
     if self.options.help:  # pragma: no cover
         self.usage()
     if self.options.version:  # pragma: no cover
         print('%(version)s' % self.__dict__)
         sys.exit(ERRORS['UNKNOWN'])
     if 'timeout' in dir(self.options):
         self.timeout = self.get_opt('timeout')
     env_verbose = os.getenv('VERBOSE')
     if isInt(env_verbose):
         # os.getenv returns a string: convert before comparing, since str > int
         # raises TypeError on Python 3, and store verbosity as an int
         env_verbose = int(env_verbose)
         if env_verbose > self.verbose:
             log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
             self.verbose = env_verbose
     elif env_verbose is None:
         pass
     else:
         log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
     self.parse_args()
     return self.options, self.args
Exemple #11
0
 def timeout_max(self, secs):
     """Store the maximum timeout.

     Args:
         secs: an integer number of seconds, or None.

     Raises:
         CodingError: if ``secs`` is neither None nor an integer.
     """
     acceptable = secs is None or isInt(secs)
     if not acceptable:
         raise CodingError('invalid timeout max passed to set_timeout_max(), must be an integer representing seconds') # pylint: disable=line-too-long
     # no upper bound is validated here, so the max can be set to any amount
     log.debug('setting max timeout to %s secs', secs)
     self.__timeout_max = secs
Exemple #12
0
 def run(self):
     """Query the Attivio AIE ingest session count and check it against thresholds.

     Performs a GET on ``/rest/ingestApi/getSessionCount``. Request failures
     and non-200 responses end the check as CRITICAL via ``qquit``; a
     non-integer response body ends it as UNKNOWN. On success ``self.msg``
     holds the count plus perfdata.
     """
     url = '{protocol}://{host}:{port}/rest/ingestApi/getSessionCount'.format(
         host=self.host, port=self.port, protocol=self.protocol)
     log.debug('GET %s', url)
     try:
         req = requests.get(url)
     except requests.exceptions.RequestException as _:
         # exceptions have no .message attribute on Python 3, use str() instead
         errmsg = str(_)
         errhint = ''
         if 'BadStatusLine' in errmsg:
             errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
         elif self.protocol == 'https' and 'unknown protocol' in errmsg:
             errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
         qquit('CRITICAL', errmsg + errhint)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(),
               '=' * 80)
     if req.status_code != 200:
         qquit('CRITICAL', '{0} {1}'.format(req.status_code, req.reason))
     try:
         count = req.content.strip()
         if not isInt(count):
             raise ValueError('non-integer value returned by Attivio AIE')
         count = int(count)
         self.msg = '{software} ingest session count = {count}'.format(
             software=self.software, count=count)
         self.check_thresholds(count)
     # bind the exception here: the handler previously referenced an unbound
     # name, which raises NameError instead of reporting the parse error
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
     self.msg += ' | ingest_session_count={0:d}{thresholds}'.format(
         count, thresholds=self.get_perf_thresholds())
 def parse(self, content):
     """Extract the regions-in-transition count from the HMaster UI page.

     Looks for an ``h2`` heading reading "Regions in Transition" and passes
     the first table after it to ``self.parse_table``. Returns 0 when the
     heading is absent, since the HMaster UI doesn't appear to print the
     section when there are no regions in transition. Parsing problems end
     the check through ``qquit('UNKNOWN', ...)``.
     """
     # could also collect lines after 'Regions-in-transition' if parsing /dump
     # sample:
     # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
     # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
     soup = BeautifulSoup(content, 'html.parser')
     regions_in_transition = 0
     try:
         for heading in soup.findAll('h2'):
             log.debug("checking heading '%s'", heading)
             if heading.get_text() != "Regions in Transition":
                 continue
             log.debug('found Regions in Transition section header')
             table = heading.find_next('table')
             log.debug('checking first following table')
             regions_in_transition = self.parse_table(table)
             if isInt(regions_in_transition):
                 continue
             detail = 'got non-integer \'{0}\' for regions in transition when parsing HMaster UI'\
                      .format(regions_in_transition)
             qquit('UNKNOWN', 'parse error - ' + detail)
         return regions_in_transition
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse HBase Master UI status page. ' + support_msg())
 def process_rows(rows):
     """Return the longest region-in-transition time found in the table rows.

     Rows without ``td`` cells (header rows) and the row whose first cell
     contains 'Regions in Transition' are skipped. Every other row must have
     exactly 3 columns, the third holding an integer RIT time.

     Args:
         rows: iterable of row elements exposing ``findChildren('td')``.

     Returns:
         The largest RIT time as an int, or None if no data row was found.
     """
     longest_rit_time = None
     # will skip header anyway when it doesn't find td (will contain th instead)
     # this will avoid accidentally skipping a row later if the input changes to rows[1:] instead of rows
     for row in rows:
         # NOTE(review): this prints every raw row to stdout - looks like a
         # debugging leftover; kept because callers may rely on the output
         print(row)
         cols = row.findChildren('td')
         # Regions in Transition rows only have 2 cols
         # <hex> region rows have Region, State, RIT time (ms)
         num_cols = len(cols)
         if num_cols == 0:
             # header row
             continue
         elif num_cols != 3:
             qquit('UNKNOWN', 'unexpected number of columns ({0}) '.format(num_cols)
                   + 'for regions in transition table. ' + support_msg())
         if 'Regions in Transition' in cols[0].get_text():
             continue
         rit_time = cols[2].get_text().strip()
         if not isInt(rit_time):
             qquit('UNKNOWN', 'parsing failed, got region in transition time of ' +
                   "'{0}', expected integer".format(rit_time))
         rit_time = int(rit_time)
         # the first data row has nothing to compare against; comparing an int
         # with None raises TypeError on Python 3
         if longest_rit_time is None or rit_time > longest_rit_time:
             longest_rit_time = rit_time
     return longest_rit_time
Exemple #15
0
    def __parse_verbose__(self):
        """Work out the verbosity from the option and $VERBOSE, then set the log level.

        Adds the ``verbose`` option to ``self.verbose`` and raises it to
        ``$VERBOSE`` when that variable holds a larger integer. ``quiet``
        forces verbosity to 0. The log level is then set from the final
        verbosity, and ``--debug`` forces DEBUG and a verbosity of at least 3.
        """
        self.verbose += int(self.get_opt('verbose'))
        env_verbose = os.getenv('VERBOSE')
        if isInt(env_verbose):
            # os.getenv returns a string: convert before comparing, since
            # str > int raises TypeError on Python 3
            env_verbose = int(env_verbose)
            if env_verbose > self.verbose:
                log.debug(
                    'environment variable $VERBOSE = %s, increasing verbosity',
                    env_verbose)
                self.verbose = env_verbose
        elif env_verbose is None:
            pass
        else:
            log.warning(
                "$VERBOSE environment variable is not an integer ('%s')",
                env_verbose)

        if self.is_option_defined('quiet') and self.get_opt('quiet'):
            self.verbose = 0
        elif self.verbose > 2:
            log.setLevel(logging.DEBUG)
        elif self.verbose > 1:
            log.setLevel(logging.INFO)
        # programs whose name starts with 'check_' keep their log level at verbosity 1
        elif self.verbose > 0 and self._prog[0:6] != 'check_':
            log.setLevel(logging.WARN)
        if self.options.debug:
            log.setLevel(logging.DEBUG)  # pragma: no cover
            log.debug('enabling debug logging')
            if self.verbose < 3:
                self.verbose = 3
 def parse_json(self, json_data):
     """Report HDFS space used from the NameNode JMX bean.

     Reads ``PercentUsed``, ``Total``, ``TotalBlocks``, ``TotalFiles`` and
     ``Used`` from the first bean, checks the percentage against the
     thresholds and builds the message and perfdata in ``self.msg``.

     Raises:
         UnknownError: if a value has the wrong type, an expected key is
             missing, or a value cannot be converted.
     """
     log.info('parsing response')
     try:
         bean = json_data['beans'][0]
         space_used_pc = bean['PercentUsed']
         # the way below is more informative
         #assert type(space_used_pc) == float
         # values ending in a negative exponent (e.g. 1.2e-05) are treated as zero
         if re.search(r'e-\d+$', str(space_used_pc)):
             space_used_pc = 0
         if not isFloat(space_used_pc):
             raise UnknownError("non-float returned for PercentUsed by namenode '{0}:{1}'"\
                                .format(self.host, self.port))
         # convert so the comparison and the float formatting below also work
         # if the value arrived as a numeric string
         space_used_pc = float(space_used_pc)
         assert space_used_pc >= 0
         stats = {}
         for stat in ('Total', 'TotalBlocks', 'TotalFiles', 'Used'):
             stats[stat] = bean[stat]
             if not isInt(stats[stat]):
                 raise UnknownError("non-integer returned for {0} by namenode '{1}:{2}'"\
                                    .format(stat, self.host, self.port))
             stats[stat] = int(stats[stat])
         self.ok()
         self.msg = 'HDFS space used = {0:.2f}% ({1}/{2})'\
                    .format(space_used_pc, humanize.naturalsize(stats['Used']), humanize.naturalsize(stats['Total']))
         self.check_thresholds(space_used_pc)
         self.msg += ", in {0:d} files spread across {1:d} blocks".format(stats['TotalFiles'], stats['TotalBlocks'])
         self.msg += " | 'HDFS % space used'={0:f}%{1}".format(space_used_pc, self.get_perf_thresholds())
         self.msg += " 'HDFS space used'={0:d}b".format(stats['Used'])
         self.msg += " 'HDFS file count'={0:d}".format(stats['TotalFiles'])
         self.msg += " 'HDFS block count'={0:d}".format(stats['TotalBlocks'])
     except KeyError as _:
         raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                            .format(self.host, self.port, _, support_msg_api()))
     except ValueError as _:
         # this function never parses LiveNodes json, so the message no longer claims it does
         raise UnknownError("invalid value returned by Namenode '{0}:{1}': {2}"\
                            .format(self.host, self.port, _))
 def parse(self, req):
     """Find the last heartbeat of ``self.node`` in the workers page and check it.

     Parses ``req.content`` as HTML, verifies that the column following
     'Node Name' is 'Last Heartbeat', then reads the cell following the
     header cell whose text is ``self.node``. The value is checked against
     the thresholds and added to ``self.msg`` with perfdata.

     Raises:
         CriticalError: if the node (or the expected table structure) is not found.
         UnknownError: if the heartbeat value is not an integer.
     """
     soup = BeautifulSoup(req.content, 'html.parser')
     last_heartbeat = None
     try:
         self.list_workers(soup)
         name_header = soup.find('th', text='Node Name')
         heartbeat_col_header = name_header.find_next_sibling().get_text()
         # make sure ordering of columns is as we expect so we're parsing the correct number for heartbeat lag
         if heartbeat_col_header != 'Last Heartbeat':
             code_error("heartbeat column header '{}' != Last Heartbeat".format(heartbeat_col_header))
         node_cell = soup.find('th', text=self.node)
         last_heartbeat = node_cell.find_next_sibling().get_text()
         if last_heartbeat is None:
             raise AttributeError
     except (AttributeError, TypeError):
         # a failed lookup returns None, and using it raises one of these
         raise CriticalError("{0} worker '{1}' not found among list of live workers!"
                             .format(self.software, self.node))
     if not isInt(last_heartbeat):
         raise UnknownError("last heartbeat '{0}' for node '{1}' is not an integer, possible parsing error! {2}"
                            .format(last_heartbeat, self.node, support_msg()))
     self.msg = "{0} worker '{1}' last heartbeat = {2} secs ago".format(self.software, self.node, last_heartbeat)
     self.check_thresholds(last_heartbeat)
     self.msg += ' | last_heartbeat={0}s{1}'.format(last_heartbeat, self.get_perf_thresholds())
Exemple #18
0
 def parse(self, content):
     """Extract the count of regions stuck in transition from the HMaster UI page.

     Looks for an ``h2`` heading reading "Regions in Transition" and passes
     the first table after it to ``self.parse_table``. Returns 0 when the
     heading is absent, since the HMaster UI doesn't appear to print the
     section when there are no regions in transition. Parsing problems end
     the check through ``qquit('UNKNOWN', ...)``.
     """
     # could also collect lines after 'Regions-in-transition' if parsing /dump
     # sample:
     # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
     # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
     soup = BeautifulSoup(content, 'html.parser')
     regions_stuck_in_transition = 0
     try:
         for heading in soup.findAll('h2'):
             log.debug("checking heading '%s'", heading)
             if heading.get_text() != "Regions in Transition":
                 continue
             log.debug('found Regions in Transition section header')
             table = heading.find_next('table')
             log.debug('checking first following table')
             regions_stuck_in_transition = self.parse_table(table)
             if isInt(regions_stuck_in_transition):
                 continue
             detail = 'got non-integer \'{0}\' for regions stuck in transition when parsing HMaster UI'\
                      .format(regions_stuck_in_transition)
             qquit('UNKNOWN', 'parse error - ' + detail)
         return regions_stuck_in_transition
     except (AttributeError, TypeError):
         qquit('UNKNOWN', 'failed to parse HBase Master UI status page. ' + support_msg())
    def run(self):
        """Fetch the HMaster status page and report regions stuck in transition.

        Validates the host/port options, GETs ``/master-status`` and hands the
        content to ``self.parse``. Request failures and non-200 responses end
        the check as CRITICAL, an unparseable count as UNKNOWN. The status is
        OK for a count of 0 and CRITICAL otherwise.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
        # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
        # https://issues.apache.org/jira/browse/HBASE-16636
        # the flat txt /dump page is no alternative as it doesn't contain the summary count
        url = 'http://%(host)s:%(port)s/master-status' % {'host': host, 'port': port}
        log.debug('GET %s', url)
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException as error:
            qquit('CRITICAL', error)
        log.debug("response: %s %s", response.status_code, response.reason)
        divider = '='*80
        log.debug("content:\n%s\n%s\n%s", divider, response.content.strip(), divider)
        if response.status_code != 200:
            qquit('CRITICAL', "%s %s" % (response.status_code, response.reason))

        stuck_count = self.parse(response.content)
        if stuck_count is None:
            qquit('UNKNOWN', 'parse error - failed to find number for regions stuck in transition')
        if not isInt(stuck_count):
            qquit('UNKNOWN', 'parse error - got non-integer for regions stuck in transition when parsing HMaster UI')

        if stuck_count != 0:
            self.critical()
        else:
            self.ok()
        self.msg = '{0} regions stuck in transition (ie. transitioning longer than HBase threshold)'\
                   .format(stuck_count)
        self.msg += " | regions_stuck_in_transition={0};0;0".format(stuck_count)
 def run(self):
     """Fetch the ingest session count over REST and check it against thresholds.

     Builds the ``getSessionCount`` URL from ``self.protocol`` / ``self.host`` /
     ``self.port``, issues a GET and expects the response body to be a bare
     integer. ``self.msg`` is set to the summary text plus perfdata.

     Request failures and non-200 responses are reported through
     ``qquit('CRITICAL', ...)``; an unparseable body through
     ``qquit('UNKNOWN', ...)``.
     """
     url = '{protocol}://{host}:{port}/rest/ingestApi/getSessionCount'.format(host=self.host,
                                                                              port=self.port,
                                                                              protocol=self.protocol)
     log.debug('GET %s', url)
     try:
         req = requests.get(url)
     except requests.exceptions.RequestException as _:
         # str() of the exception works on both Python 2 and 3, whereas the
         # '.message' attribute does not exist on Python 3 exceptions
         error = str(_)
         errhint = ''
         if 'BadStatusLine' in error:
             errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
         elif self.protocol == 'https' and 'unknown protocol' in error:
             errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
         qquit('CRITICAL', error + errhint)
     log.debug("response: %s %s", req.status_code, req.reason)
     log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
     if req.status_code != 200:
         qquit('CRITICAL', '{0} {1}'.format(req.status_code, req.reason))
     try:
         count = req.content.strip()
         if not isInt(count):
             raise ValueError('non-integer value returned by Attivio AIE')
         count = int(count)
         self.msg = '{software} ingest session count = {count}'.format(software=self.software, count=count)
         self.check_thresholds(count)
     # bind the exception here: the handler below formats it, and no name is
     # otherwise bound at this point
     except (KeyError, ValueError) as _:
         qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
                          .format(software=self.software,
                                  exception=type(_).__name__,
                                  error=_,
                                  support_msg=support_msg_api()))
     self.msg += ' | ingest_session_count={0:d}{thresholds}'.format(count, thresholds=self.get_perf_thresholds())
Exemple #21
0
 def timeout_max(self, secs):
     """Store the maximum timeout in seconds.

     ``None`` is accepted as-is; any other value must pass ``isInt()``.

     Raises:
         CodingError: if ``secs`` is neither None nor accepted by isInt().
     """
     acceptable = secs is None or isInt(secs)
     if not acceptable:
         raise CodingError('invalid timeout max passed to set_timeout_max(), must be an integer representing seconds') # pylint: disable=line-too-long
     # deliberately no upper bound so that the max can be set to any amount
     # validate_int(secs, 'timeout default', 0, self.__timeout_max )
     log.debug('setting max timeout to %s secs', secs)
     self.__timeout_max = secs
Exemple #22
0
 def collapse_sql_fields(self, row):
     """Shrink a row that has more fields than there are headers.

     The column positions come from ``self.indicies`` ('sql_index',
     'sql_index2', 'object_index'). When ``row`` is longer than
     ``self.len_headers`` the SQL columns are rewritten from decommented,
     comma-joined slices and the surplus fields are dropped; otherwise the
     row is returned as it came in.

     Raises:
         AssertionError: if the halved surplus fails the isInt() check.
     """
     first_sql = self.indicies['sql_index']
     second_sql = self.indicies['sql_index2']
     object_pos = self.indicies['object_index']
     num_fields = len(row)
     if not num_fields > self.len_headers:
         log.debug('not collapsing row: %s', row)
         return row
     log.debug('collapsing fields in row: %s', row)
     surplus = num_fields - self.len_headers
     # history from the original author: the surplus was first always halved
     # because the SQL operational text appears twice, but in some rows only
     # the 2nd occurrence is split into extra fields. The object field being
     # TABLE or DATABASE is used as the marker that the 1st SQL field was
     # not split.
     first_sql_was_split = row[object_pos] not in ('TABLE', 'DATABASE')
     if first_sql_was_split:
         surplus /= 2
         # slice indices must be integers
         if not isInt(surplus):
             raise AssertionError("difference in field length '{}' is not an integer for row: {}"\
                                 .format(surplus, row))
         surplus = int(surplus)
         # NOTE(review): the slice ends at 'surplus' rather than at
         # first_sql + surplus - confirm this is intended
         first_pieces = [self.sql_decomment(_) for _ in row[first_sql:surplus]]
         row[first_sql] = ','.join(first_pieces)
         row = row[:first_sql] + row[first_sql + surplus:]
     # NOTE(review): same slice-end question as above for the 2nd SQL field
     second_pieces = [self.sql_decomment(_) for _ in row[second_sql:surplus]]
     row[second_sql] = ','.join(second_pieces)
     row = row[:second_sql] + row[second_sql + surplus:]
     log.debug('collapsed row: %s', row)
     return row
Exemple #23
0
    def run(self):
        """Check the HBase Master UI for regions stuck in transition.

        Validates the host / port options, fetches the ``/master-status`` page,
        extracts the count with ``self.parse()`` and sets the status to OK when
        the count is zero and CRITICAL otherwise. ``self.msg`` is set to the
        summary text followed by perfdata.

        Request failures and non-200 responses are reported through
        ``qquit('CRITICAL', ...)``; a missing or non-integer count is reported
        through ``qquit('UNKNOWN', ...)``.
        """
        self.no_args()
        host = self.get_opt('host')
        port = self.get_opt('port')
        validate_host(host)
        validate_port(port)

        # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
        # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
        # https://issues.apache.org/jira/browse/HBASE-16636
        #url = 'http://%(host)s:%(port)s/jmx' % locals()
        # could get info from flat txt debug page but it doesn't contain the summary count
        #url = 'http://%(host)s:%(port)s/dump' % locals()
        url = 'http://%(host)s:%(port)s/master-status' % locals()
        log.debug('GET %s', url)
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException as _:
            qquit('CRITICAL', _)
        log.debug("response: %s %s", req.status_code, req.reason)
        log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
        if req.status_code != 200:
            qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
        regions_stuck_in_transition = self.parse(req.content)
        if regions_stuck_in_transition is None:
            qquit('UNKNOWN', 'parse error - failed to find number for regions stuck in transition')
        if not isInt(regions_stuck_in_transition):
            qquit('UNKNOWN', 'parse error - got non-integer for regions stuck in transition when parsing HMaster UI')
        # the value is only validated with isInt() up to here and may still be the
        # text scraped from the page; normalize it so that a textual '0' compares
        # equal to 0 below instead of always falling through to CRITICAL
        regions_stuck_in_transition = int(regions_stuck_in_transition)
        if regions_stuck_in_transition == 0:
            self.ok()
        else:
            self.critical()
        self.msg = '{0} regions stuck in transition (ie. transitioning longer than HBase threshold)'\
                   .format(regions_stuck_in_transition)
        self.msg += " | regions_stuck_in_transition={0};0;0".format(regions_stuck_in_transition)
Exemple #24
0
 def verbose_default(self, arg):
     """Store the default verbosity level as an int.

     Raises:
         CodingError: if ``arg`` is not accepted by isInt().
     """
     if isInt(arg):
         log.debug('setting default verbose to %s', arg)
         self.__verbose_default = int(arg)
         return
     raise CodingError(
         'invalid verbose level passed to verbose_default(), must be an integer'
     )
 def check_app_elapsed_times(self, app_list):
     """Check the elapsed time of every selected app against the thresholds.

     Apps rejected by ``self.app_selector()`` are ignored. Each remaining
     app's 'elapsedTime' is divided by 1000 and passed to
     ``self.check_thresholds()``; a non-empty return value counts the app as
     breaching its SLA.

     Returns:
         tuple: (number of apps breaching SLA, number of matching apps,
         highest elapsed time seen, threshold message of that app prefixed
         with a space, or '' if it had none).

     Raises:
         UnknownError: if an app's 'elapsedTime' is not accepted by isInt().
     """
     num_apps_breaching_sla = 0
     max_elapsed = 0
     matching_apps = 0
     max_threshold_msg = ''
     # save msg as check_thresholds appends to it which we want to reset in this case
     msg = self.msg
     for app in app_list:
         if not self.app_selector(app):
             continue
         name = app['name']
         matching_apps += 1
         elapsed_time = app['elapsedTime']
         # explicit check instead of assert, which is stripped under python -O
         if not isInt(elapsed_time):
             raise UnknownError('elapsed_time {} is not an integer!'.format(elapsed_time))
         elapsed_time = int(elapsed_time / 1000)
         threshold_msg = self.check_thresholds(elapsed_time)
         if threshold_msg:
             num_apps_breaching_sla += 1
             log.info("app '%s' is breaching SLA", name)
         if elapsed_time > max_elapsed:
             max_elapsed = elapsed_time
             max_threshold_msg = threshold_msg
     if max_threshold_msg:
         max_threshold_msg = ' ' + max_threshold_msg
     # restore msg prefix as check_thresholds appends every threshold breach
     self.msg = msg
     return (num_apps_breaching_sla, matching_apps, max_elapsed, max_threshold_msg)
Exemple #26
0
 def __parse_timeout__(self):
     """Resolve the timeout from --timeout, then $TIMEOUT, then the default.

     The first source that yields a value wins: the 'timeout' option (when
     the options object has one), the TIMEOUT environment variable (when it
     passes isInt()), and finally ``self.timeout_default``.
     """
     # reset this to none otherwise unit tests fail to take setting from timeout_default
     # use __timeout to bypass the property setter checks
     self.__timeout = None
     if 'timeout' in dir(self.options):
         timeout = self.get_opt('timeout')
         if timeout is not None:
             # log the option value just read, not self.timeout which was reset above
             log.debug('getting --timeout value %s', timeout)
             self.timeout = timeout
     if self.timeout is None:
         env_timeout = os.getenv('TIMEOUT')
         log.debug('getting $TIMEOUT value %s', env_timeout)
         if env_timeout is not None:
             log.debug('env_timeout is not None')
             if isInt(env_timeout):
                 log.debug(
                     "environment variable $TIMEOUT = '%s' and timeout not already set, setting timeout = %s",
                     env_timeout, env_timeout)
                 self.timeout = int(env_timeout)
             else:
                 log.warning(
                     "$TIMEOUT environment variable is not an integer ('%s')",
                     env_timeout)
     if self.timeout is None:
         log.debug('timeout not set, using default timeout %s',
                   self.timeout_default)
         self.timeout = self.timeout_default
Exemple #27
0
 def check_app_elapsed_times(self, app_list):
     """Check the elapsed time of every selected app against the thresholds.

     Apps rejected by ``self.app_selector()`` are skipped. ``self.msg`` is
     put back to its initial value before returning.

     Returns:
         tuple: (number of apps breaching SLA, number of matching apps,
         highest elapsed time seen, threshold message of that app prefixed
         with a space, or '' if it had none).

     Raises:
         UnknownError: if an app's 'elapsedTime' is not accepted by isInt().
     """
     breaching = 0
     longest = 0
     selected = 0
     longest_msg = ''
     # check_thresholds appends to self.msg, keep a copy to put back afterwards
     original_msg = self.msg
     for app in app_list:
         if self.app_selector(app):
             app_name = app['name']
             selected += 1
             elapsed = app['elapsedTime']
             if not isInt(elapsed):
                 raise UnknownError(
                     'elapsed_time {} is not an integer!'.format(elapsed))
             elapsed = int(elapsed / 1000)
             breach_msg = self.check_thresholds(elapsed)
             if breach_msg:
                 breaching += 1
                 log.info("app '%s' is breaching SLA", app_name)
             if elapsed > longest:
                 longest, longest_msg = elapsed, breach_msg
     if longest_msg:
         longest_msg = ' ' + longest_msg
     # drop whatever the threshold checks appended along the way
     self.msg = original_msg
     return (breaching, selected, longest, longest_msg)
Exemple #28
0
 def parse_builds(self, content):
     """Parse the Travis API JSON and collect up to ``self.num`` builds.

     Which builds are collected depends on the mode flags: with
     ``self.completed`` only builds in a finished state, with ``self.failed``
     only failed / errored builds, otherwise any build. The whole list is
     always walked so that the descending build number sanity check covers
     every build returned.

     Args:
         content: JSON text containing a 'builds' list.

     Returns:
         list: the collected build dicts.

     Raises:
         UnknownError: if a build number is not an integer or the build
             numbers are not strictly descending.
     """
     log.debug('parsing build info')
     collected_builds = []
     json_data = json.loads(content)
     if not json_data or \
        'builds' not in json_data or \
        not json_data['builds']:
         qquit(
             'UNKNOWN', "no Travis CI builds returned by the Travis API." +
             " Either the specified repo '{0}' doesn't exist".format(
                 self.repo) + " or no builds have happened yet?" +
             " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this"
             +
             " blank build set whereas 'HariSekhon/nagios-plugins' succeeds"
             + " in returning latest builds information")
     builds = json_data['builds']
     # get latest finished failed build
     last_build_number = None
     found_newer_passing_build = False
     for _ in builds:
         # API returns most recent build first
         # extra check to make sure we're getting the very latest build number and API hasn't changed
         build_number = _['number']
         if not isInt(build_number):
             raise UnknownError('build number returned is not an integer!')
         build_number = int(build_number)
         if last_build_number is None:
             # seed above the first build so the sequence check passes for it
             last_build_number = build_number + 1
         if build_number >= last_build_number:
             # separator added so the support message no longer runs straight on
             # from the word 'returned'
             raise UnknownError('build number returned is out of sequence, cannot be >= last build returned. ' + \
                                '{0}'.format(support_msg_api()))
         last_build_number = build_number
         if self.completed:
             if len(collected_builds) < self.num and _['state'] in (
                     'passed', 'finished', 'failed', 'errored'):
                 collected_builds.append(_)
         elif self.failed:
             if _['state'] == 'passed':
                 if not collected_builds and not found_newer_passing_build:
                     log.warning("found more recent successful build #%s with state = '%s'" + \
                                 ", you may not need to debug this build any more", _['number'], _['state'])
                     found_newer_passing_build = True
             elif _['state'] in ('failed', 'errored'):
                 if len(collected_builds) < self.num:
                     collected_builds.append(_)
                     # by continuing to iterate through the rest of the builds we can check
                     # their last_build numbers are descending for extra sanity checking
                     #break
         elif len(collected_builds) < self.num:
             collected_builds.append(_)
             # by continuing to iterate through the rest of the builds we can check
             # their last_build numbers are descending for extra sanity checking
             #break
     if not collected_builds:
         qquit('UNKNOWN', 'no recent builds found')
     if log.isEnabledFor(logging.DEBUG):
         for build in collected_builds:
             log.debug("build:\n%s", jsonpp(build))
     return collected_builds
    def print_results(self, term, limit=None):
        """Search for ``term`` and print the results in 'docker search' layout.

        Results are grouped by star count and printed highest first, names
        sorted within each star count. In quiet mode only the names are
        printed; in verbose (non-quiet) mode a shown / total footer follows.
        """
        data = self.search(term, limit)
        by_stars = {}
        name_width = 8
        try:
            # grouped by star count so the output can be ordered like 'docker search'
            for entry in data['results']:
                star = entry['star_count']
                name = entry['name']
                name_width = max(name_width, len(name))
                if not isInt(star):
                    die("star count '{0}' for repo '{1}' is not an integer! {2}"
                        .format(star, name, support_msg_api()))
                by_stars.setdefault(star, {}).setdefault(name, {})
                # 'is_trusted' is intentionally left out, docker search does not show it either
                details = {
                    'description': entry['description'],
                    'official': '[OK]' if entry['is_official'] else '',
                    'automated': '[OK]' if entry['is_automated'] else '',
                }
                by_stars[star][name] = details
            # header spacing mimics the 'docker search' command
            if not self.quiet:
                header = '{0:{5}s}   {1:45s}   {2:7s}   {3:8s}   {4:10s}'.format(
                    'NAME', 'DESCRIPTION', 'STARS', 'OFFICIAL', 'AUTOMATED',
                    name_width)
                print(header)
        except KeyError as _:
            die('failed to parse results fields from data returned by DockerHub '
                + '(format may have changed?): {0}'.format(_))
        except IOError as _:
            # a closed pipe on the reading side is tolerated, anything else is not
            if str(_) != '[Errno 32] Broken pipe':
                raise

        def truncate(mystr, length):
            """Cut ``mystr`` down to ``length`` characters, ending in '...'."""
            if len(mystr) <= length:
                return mystr
            return mystr[0:length - 3] + '...'

        for star in sorted(by_stars, reverse=True):
            repos = by_stars[star]
            for name in sorted(repos):
                if self.quiet:
                    print(name.encode('utf-8'))
                    continue
                desc = truncate(repos[name]['description'], 45)
                line = '{0:{5}s}   {1:45s}   {2:<7d}   {3:8s}   {4:10s}'.format(
                    name.encode('utf-8'), desc.encode('utf-8'),
                    star, repos[name]['official'],
                    repos[name]['automated'],
                    name_width)
                print(line)
        if self.verbose and not self.quiet:
            try:
                footer = '\nResults Shown: {0}\nTotal Results: {1}'.format(
                    len(data['results']), data['num_results'])
                print(footer)
            except KeyError as _:
                die('failed to parse get total results count from data returned by DockerHub '
                    + '(format may have changed?): {0}'.format(_))
Exemple #30
0
 def __parse_args__(self):
     """Parse the command line and apply the generic options.

     Handles --help / --version, picks up the 'timeout' option when the
     options object has one, raises the verbosity from $VERBOSE when that
     is a larger integer, then calls ``self.parse_args()``.

     Returns:
         tuple: (self.options, self.args)
     """
     try:
         (self.options, self.args) = self.__parser.parse_args()
     # I don't agree with zero exit code from OptionParser for help/usage,
     # and want UNKNOWN not CRITICAL(2) for switch mis-usage...
     except SystemExit:  # pragma: no cover
         sys.exit(ERRORS['UNKNOWN'])
     if self.options.help:  # pragma: no cover
         self.usage()
     if self.options.version:  # pragma: no cover
         print('%(version)s' % self.__dict__)
         sys.exit(ERRORS['UNKNOWN'])
     if 'timeout' in dir(self.options):
         self.timeout = self.get_opt('timeout')
     env_verbose = os.getenv('VERBOSE')
     if env_verbose is None:
         pass
     elif isInt(env_verbose):
         # os.getenv() returns a string; convert before comparing, otherwise
         # the comparison is str vs int and verbose ends up holding a string
         env_verbose = int(env_verbose)
         if env_verbose > self.verbose:
             log.debug('environment variable $VERBOSE = %s, increasing verbosity', env_verbose)
             self.verbose = env_verbose
     else:
         log.warning("$VERBOSE environment variable is not an integer ('%s')", env_verbose)
     self.parse_args()
     return self.options, self.args
 def parse(json_data):
     """Return the compactionQueueLength of the RegionServer bean.

     Walks ``json_data['beans']`` for the bean named
     'Hadoop:service=HBase,name=RegionServer,sub=Server' and returns its
     'compactionQueueLength' value. A non-integer value, a missing key or a
     missing bean is reported through ``qquit('UNKNOWN', ...)``.
     """
     try:
         # it's already nicely layed out
         #if log.isEnabledFor(logging.DEBUG):
         #    log.debug('%s', jsonpp(json_data))
         compaction_queue_size = None
         for bean in json_data['beans']:
             if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                 if log.isEnabledFor(logging.DEBUG):
                     log.debug('found RegionServer section:')
                     log.debug('%s', jsonpp(bean))
                 compaction_queue_size = bean['compactionQueueLength']
                 if not isInt(compaction_queue_size):
                     qquit(
                         'UNKNOWN',
                         'non-integer returned for compactionQueueLength! '
                         + support_msg_api())
                 return compaction_queue_size
     except KeyError as _:
         # format the exception into the message: adding a KeyError to a str
         # raises TypeError and would hide the real parse failure
         qquit(
             'UNKNOWN', '{0}: failed to parse HBase Master jmx info. '.format(_) +
             support_msg_api())
     qquit(
         'UNKNOWN',
         'RegionServer mbean not found, double check this is pointing to an HBase RegionServer'
     )
Exemple #32
0
 def process_rows(rows):
     """Return the longest region-in-transition time found in ``rows``.

     Args:
         rows: table row elements; each must support ``findChildren('td')``.

     Returns:
         The largest RIT time as an int, or None when no data row was found.

     Rows with an unexpected number of columns or a non-integer time are
     reported through ``qquit('UNKNOWN', ...)``.
     """
     longest_rit_time = None
     # will skip header anyway when it doesn't find td (will contain th instead)
     # this will avoid accidentally skipping a row later if the input changes to rows[1:] instead of rows
     #for row in rows[1:]:
     for row in rows:
         cols = row.findChildren('td')
         # Regions in Transition rows only have 2 cols
         # <hex> region rows have Region, State, RIT time (ms)
         num_cols = len(cols)
         if num_cols == 0:
             # header row
             continue
         elif num_cols != 3:
             qquit('UNKNOWN', 'unexpected number of columns ({0}) '.format(num_cols)
                   + 'for regions in transition table. ' + support_msg())
         if 'Regions in Transition' in cols[0].get_text():
             continue
         rit_time = cols[2].get_text().strip()
         if not isInt(rit_time):
             qquit('UNKNOWN', 'parsing failed, got region in transition time of ' +
                   "'{0}', expected integer".format(rit_time))
         rit_time = int(rit_time)
         # None cannot be ordered against an int on Python 3, so the first
         # value seen is taken explicitly
         if longest_rit_time is None or rit_time > longest_rit_time:
             longest_rit_time = rit_time
     return longest_rit_time
Exemple #33
0
 def parse_output(self, content):
     """Parse the Master UI base stats table for min / max regions per server.

     Finds the table inside the 'tab_baseStats' div, verifies the expected
     'ServerName' and 'Num. Regions' headers and updates
     ``self.server_min_regions`` / ``self.server_max_regions`` with
     ``(server, num_regions)`` tuples. Rows whose server name matches
     ``self.total_regex`` are skipped.

     Header changes, non-integer region counts and pages that cannot be
     navigated are reported through ``qquit('UNKNOWN', ...)``.
     """
     soup = BeautifulSoup(content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(
             soup.prettify(), '=' * 80))
     # shorter to just catch NoneType attribute error when tag not found and returns None
     try:
         basestats = soup.find('div', {'id': 'tab_baseStats'})
         table = basestats.find('table')
         #for table in basestats:
         rows = table.findAll('tr')
         headers = rows[0].findAll('th')
         header_server = headers[0].get_text()
         header_regions = headers[3].get_text()
         wider_table = len(headers) > 4
         # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
         # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
         if wider_table:
             header_regions = headers[4].get_text()
         if header_server != 'ServerName':
             qquit(
                 'UNKNOWN', "Table headers in Master UI have changed" +
                 " (got {0}, expected 'ServerName'). ".format(header_server)
                 + support_msg())
         if header_regions != 'Num. Regions':
             qquit(
                 'UNKNOWN', "Table headers in Master UI have changed" +
                 " (got {0}, expected 'Num. Regions'). ".format(
                     header_regions) + support_msg())
         log.debug('%-50s\tnum_regions', 'server')
         for row in rows[1:]:
             # this can be something like:
             # 21689588ba40,16201,1473775984259
             # so don't apply isHost() validation because it'll fail FQDN / IP address checks
             cols = row.findAll('td')
             server = cols[0].get_text()
             if self.total_regex.match(server):
                 continue
             num_regions = cols[3].get_text()
             if wider_table:
                 num_regions = cols[4].get_text()
             if not isInt(num_regions):
                 # the server placeholder must be {0}: it is the only argument
                 # passed to format(), and an IndexError from a wrong index
                 # would be swallowed by the except clause below
                 qquit(
                     'UNKNOWN',
                     "parsing error - got '{0}' for num regions".format(
                         num_regions) +
                     " for server '{0}', was expecting integer.".format(
                         server) + " UI format must have changed. " +
                     support_msg())
             num_regions = int(num_regions)
             log.debug('%-50s\t%s', server, num_regions)
             if self.server_min_regions[
                     1] is None or num_regions < self.server_min_regions[1]:
                 self.server_min_regions = (server, num_regions)
             if self.server_max_regions[
                     1] is None or num_regions > self.server_max_regions[1]:
                 self.server_max_regions = (server, num_regions)
     except (AttributeError, TypeError, IndexError):
         qquit('UNKNOWN', 'failed to find parse output')
Exemple #34
0
 def parse_json(self, json_data):
     """Report the available Pingdom SMS credits and check them against thresholds.

     Reads ``json_data['credits']['availablesms']``, sets ``self.msg`` and
     appends perfdata with the lower-boundary thresholds.

     Raises:
         UnknownError: if the credits value is not accepted by isInt().
     """
     account_credits = json_data['credits']
     available = account_credits['availablesms']
     if not isInt(available):
         raise UnknownError('Pingdom API returned non-integer for availablesms field')
     self.msg = 'Pingdom SMS credits available: {}'.format(available)
     self.check_thresholds(available)
     perfdata = ' | sms_credits={}'.format(available)
     self.msg += perfdata
     self.msg += self.get_perf_thresholds(boundary='lower')
 def parse_json(self, json_data):
     """Append the Jenkins executor count to ``self.msg`` and check thresholds.

     Reads ``json_data['numExecutors']`` and appends perfdata with the
     lower-boundary thresholds.

     Raises:
         UnknownError: if the executor count is not accepted by isInt().
     """
     executors = json_data['numExecutors']
     if not isInt(executors):
         raise UnknownError('non-integer returned by Jenkins. {0}'.format(support_msg_api()))
     self.msg += '{:d}'.format(executors)
     # the threshold check stays between the summary and the perfdata
     self.check_thresholds(executors)
     perfdata = ' | num_executors={0:d}'.format(executors)
     self.msg += perfdata
     self.msg += self.get_perf_thresholds(boundary='lower')
Exemple #36
0
 def parse_json(self, json_data):
     """Append the Jenkins executor count to ``self.msg`` and check thresholds.

     Reads ``json_data['numExecutors']`` and appends perfdata with the
     lower-boundary thresholds.

     Raises:
         UnknownError: if the executor count is not accepted by isInt().
     """
     executors = json_data['numExecutors']
     if not isInt(executors):
         raise UnknownError('non-integer returned by Jenkins. {0}'.format(support_msg_api()))
     self.msg += '{:d}'.format(executors)
     # the threshold check stays between the summary and the perfdata
     self.check_thresholds(executors)
     perfdata = ' | num_executors={0:d}'.format(executors)
     self.msg += perfdata
     self.msg += self.get_perf_thresholds(boundary='lower')
 def parse_json(self, json_data):
     """Process Jenkins job JSON, or the JSON of its last completed build.

     Three cases are handled:
       * ``self.list_jobs`` set: print the job names and exit with the
         UNKNOWN code.
       * job level JSON (has 'lastCompletedBuild'): point ``self.path`` at
         that build, query it and pass the response to
         ``self.process_json()``.
       * build level JSON: append status, duration and age to ``self.msg``
         and set the status accordingly.

     Raises:
         WarningError: if the job has no completed build yet.
         UnknownError: if 'duration' or 'timestamp' are not accepted by isInt().
     """
     if self.list_jobs:
         print('Jenkins Jobs:\n')
         for job in json_data['jobs']:
             print(job['name'])
         sys.exit(ERRORS['UNKNOWN'])
     if 'lastCompletedBuild' in json_data:
         last_build = json_data['lastCompletedBuild']
         if not last_build:
             raise WarningError("job '{job}' not built yet".format(job=self.job))
         # second round trip: fetch the details of that build
         self.path = '/job/{job}/{number}/api/json'.format(job=self.job,
                                                           number=last_build['number'])
         self.process_json(self.query().content)
         return
     build_name = json_data['displayName']
     duration = json_data['duration']
     if not isInt(duration):
         raise UnknownError('duration field returned non-integer! {0}'.format(support_msg_api()))
     # reported as secs in the message and as 's' perfdata below
     duration = int(duration) / 1000
     outcome = json_data['result']
     started = json_data['timestamp']
     if not isInt(started):
         raise UnknownError('timestamp field returned non-integer! {0}'.format(support_msg_api()))
     started = int(started)
     in_progress = json_data['building']
     self.msg += "build {build} status: ".format(build=build_name)
     if in_progress:
         self.unknown()
         self.msg += 'STILL BUILDING!'
         return
     self.msg += outcome
     if outcome != 'SUCCESS':
         self.critical()
     self.msg += ', duration={duration} secs'.format(duration=duration)
     self.check_thresholds(duration)
     age = time.time() - (started/1000)
     self.msg += ', age={age} secs'.format(age=sec2human(age))
     if age < 0:
         # a negative age means the build timestamp is ahead of the local clock
         self.warning()
         self.msg += ' (< 0!)'
     if self.age and age > self.age:
         self.critical()
         self.msg += ' (> {0:d})'.format(self.age)
     perf_thresholds = self.get_perf_thresholds()
     self.msg += ' | build_duration={duration}s{perf_thresholds}'.format(duration=duration,
                                                                         perf_thresholds=perf_thresholds)
Exemple #38
0
 def mac_getent_passwd_user(self, user):
     """Build a passwd-style record for ``user`` from 'dscl . -read' output.

     Runs the dscl command through ``self.cmd()`` and maps its fields onto
     ``user:password:uid:gid:name:homedir:shell``. The password field is
     always written as 'x'. When no RealName was seen by the time the
     RecordName line is processed, the record name is used as the name.

     Returns:
         tuple: (record string, return code of the command); the record is
         '' when no RecordName was found in the output.

     A non-numeric UID or GID is reported through ``die()``.
     """
     log.info('mac_getent_passwd_user(%s)', user)
     # NOTE(review): 'user' is interpolated into the command string as-is;
     # confirm how self.cmd() executes it if user can come from untrusted input
     command = 'dscl . -read /Users/{user}'.format(user=user)
     (output, returncode) = self.cmd(command)
     user = password = uid = gid = name = homedir = shell = ''
     #log.info('parsing output for passwd conversion')
     output = output.split('\n')
     for (index, line) in enumerate(output):
         tokens = line.split()
         if len(tokens) < 1:
             continue
         field = tokens[0]
         if len(tokens) < 2:
             value = ''
         else:
             value = tokens[1]
         if field == 'RecordName:':
             user = value
             # fall back to the record name when no real name is known; this
             # used to sit in a second 'RecordName:' branch that could never run
             if not name:
                 name = value
         elif field == 'Password:':
             password = 'x'
         elif field == 'UniqueID:':
             uid = value
         elif field == 'PrimaryGroupID:':
             gid = value
         elif field == 'RealName:':
             name = value
             # the real name may be on the following, indented line
             if not value and len(output) > index + 1 and output[index+1].startswith(' '):
                 name = output[index+1].strip()
         elif field == 'NFSHomeDirectory:':
             homedir = value
         elif field == 'UserShell:':
             shell = value
     if not user:
         return('', returncode)
     getent_record = '{user}:{password}:{uid}:{gid}:{name}:{homedir}:{shell}'.format\
                     (user=user, password=password, uid=uid, gid=gid, name=name, homedir=homedir, shell=shell)
     if not isInt(uid, allow_negative=True):
         die("parsing error: UID '{uid}' is not numeric in record {record}!".format(uid=uid, record=getent_record))
     if not isInt(gid, allow_negative=True):
         die("parsing error: GID '{gid}' is not numeric in record {record}!".format(gid=gid, record=getent_record))
     return (getent_record, returncode)
Exemple #39
0
 def check_ping(host, count=None, wait=None):
     """Ping ``host`` with the system ping command.

     Args:
         host: host name or address handed to ping.
         count: number of pings, defaults to 1.
         wait: value for the wait switch, defaults to 3.

     Returns:
         ``host`` when ping exits with code 0, otherwise None.

     Raises:
         UnknownError: if ``count`` or ``wait`` is not accepted by isInt().

     An OSError from launching ping is reported through ``die()``.
     """
     if count is None:
         count = 1
     if wait is None:
         wait = 3
     if not isInt(count):
         raise UnknownError("passed invalid count '{0}' to check_ping method, must be a valid integer!"\
                            .format(count))
     if not isInt(wait):
         raise UnknownError("passed invalid wait '{0}' to check_ping method, must be a valid integer!"\
                            .format(wait))
     log.info("pinging host '%s' (count=%s, wait=%s)", host, count, wait)
     count_switch = '-c'
     if platform.system().lower() == 'windows':
         count_switch = '-n'
     wait_switch = '-w'
     if platform.system().lower() == 'darwin':
         wait_switch = '-W'
     # causes hang if count / wait are not cast to string
     cmd = [
         'ping', count_switch, '{0}'.format(count), wait_switch,
         '{0}'.format(wait), host
     ]
     log.debug('cmd: %s', ' '.join(cmd))
     #log.debug('args: %s', cmd)
     try:
         process = subprocess.Popen(cmd,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
         # communicate() reads both pipes and waits for the process to end,
         # so no separate wait() call is needed afterwards
         (stdout, stderr) = process.communicate()
         exitcode = process.returncode
         log.debug('stdout: %s', stdout)
         log.debug('stderr: %s', stderr)
         log.debug('exitcode: %s', exitcode)
         if exitcode == 0:
             log.info("host '%s' responded to ping", host)
             return host
     except subprocess.CalledProcessError as _:
         log.warning('ping failed: %s', _.output)
     except OSError as _:
         die('error calling ping: {0}'.format(_))
     return None
    def process_result(self, result):
        """Turn a DockerHub build result into the status and message.

        Logs the fields of ``result``, computes the latency between
        'created_date' and 'last_updated', maps the numeric status through
        ``self.statuses`` (unknown codes are treated as 'Error') and goes
        CRITICAL unless the mapped status is 'Success'. The summary,
        optional verbose details and perfdata are appended to ``self.msg``.

        Raises:
            UnknownError: if the status is not accepted by isInt().
        """
        _id = result['id']
        log.info('latest build id: %s', _id)

        status = result['status']
        log.info('status: %s', status)
        if not isInt(status, allow_negative=True):
            raise UnknownError(
                'non-integer status returned by DockerHub API. {0}'.format(
                    support_msg_api()))

        tag = result['dockertag_name']
        log.info('tag: %s', tag)

        trigger = result['cause']
        log.info('trigger: %s', trigger)

        created_date = result['created_date']
        log.info('created date: %s', created_date)

        last_updated = result['last_updated']
        log.info('last updated: %s', last_updated)

        # anything after the first '.' is cut off before parsing
        timestamp_format = '%Y-%m-%dT%H:%M:%S'
        created_at = datetime.datetime.strptime(
            created_date.split('.')[0], timestamp_format)
        updated_at = datetime.datetime.strptime(
            last_updated.split('.')[0], timestamp_format)
        build_latency = (updated_at - created_at).total_seconds()
        log.info('build latency (creation to last updated): %s', build_latency)
        # results in .0 floats anyway
        build_latency = int(build_latency)

        build_code = result['build_code']
        build_url = 'https://hub.docker.com/r/{0}/builds/{1}'.format(
            self.repo, build_code)
        log.info('latest build URL: %s', build_url)

        status_key = str(status)
        if status_key in self.statuses:
            status = self.statuses[status_key]
        else:
            log.warning("status code '%s' not recognized! %s", status,
                        support_msg_api())
            log.warning('defaulting to assume status is an Error')
            status = 'Error'
        if status != 'Success':
            self.critical()
        self.msg += "'{repo}' last completed build status: '{status}', tag: '{tag}', build code: {build_code}"\
                    .format(repo=self.repo, status=status, tag=tag, build_code=build_code)
        if self.verbose:
            verbose_details = [
                ', id: {0}'.format(_id),
                ', trigger: {0}'.format(trigger),
                ', created date: {0}'.format(created_date),
                ', last updated: {0}'.format(last_updated),
                ', build_latency: {0}'.format(sec2human(build_latency)),
                ', build URL: {0}'.format(build_url),
            ]
            self.msg += ''.join(verbose_details)
        self.msg += ' | build_latency={0:d}s'.format(build_latency)
 def parse_json(self, json_data):
     """Report the statistics DB event queue length found in json_data.

     Reads json_data['statistics_db_event_queue'], raises UnknownError when
     it is not an integer, then writes the status message, runs
     self.check_thresholds() on the value and appends the perfdata.
     """
     queue_length = json_data['statistics_db_event_queue']
     if not isInt(queue_length):
         template = "non-integer stats db event queue returned ('{0}'). {1}"
         raise UnknownError(template.format(queue_length, support_msg_api()))
     queue_length = int(queue_length)
     self.msg = "{0} stats dbs event queue = {1}".format(self.name, queue_length)
     self.check_thresholds(queue_length)
     # perfdata section follows the pipe separator
     perfdata = " | stats_db_event_queue={0}".format(queue_length)
     self.msg += perfdata
     self.msg += self.get_perf_thresholds()
Exemple #42
0
 def timeout_default(self, secs):
     """Set the default timeout in seconds, or None to unset it.

     secs must be None or a value accepted by isInt(). It is converted to
     int before being compared against self.timeout_max.

     Raises CodingError if secs is not an integer or is greater than
     self.timeout_max (when timeout_max is not None).
     """
     if secs is not None:
         if not isInt(secs):
             raise CodingError('invalid timeout passed to timeout_default = , must be an integer representing seconds') # pylint: disable=line-too-long
         # convert before comparing, otherwise an integer string would be compared
         # against timeout_max as str vs int instead of numerically
         secs = int(secs)
         # validate_int(secs, 'timeout default', 0, self.__timeout_max )
         if self.timeout_max is not None and secs > self.timeout_max:
             raise CodingError('set default timeout > timeout max')
     log.debug('setting default timeout to %s secs', secs)
     self.__timeout_default = secs
Exemple #43
0
 def timeout_default(self, secs):
     """Set the default timeout in seconds, or None to unset it.

     secs must be None or a value accepted by isInt(). It is converted to
     int before being compared against self.timeout_max.

     Raises CodingError if secs is not an integer or is greater than
     self.timeout_max (when timeout_max is not None).
     """
     if secs is not None:
         if not isInt(secs):
             raise CodingError('invalid timeout passed to timeout_default = , must be an integer representing seconds') # pylint: disable=line-too-long
         # convert before comparing, otherwise an integer string would be compared
         # against timeout_max as str vs int instead of numerically
         secs = int(secs)
         # validate_int(secs, 'timeout default', 0, self.__timeout_max )
         if self.timeout_max is not None and secs > self.timeout_max:
             raise CodingError('set default timeout > timeout max')
     log.debug('setting default timeout to %s secs', secs)
     self.__timeout_default = secs
Exemple #44
0
 def __init__(self):
     """Initialise CLI state, the version / usage strings and the option parser.

     The help formatter width comes from the COLUMNS environment variable
     when that is a non-empty integer, otherwise from Terminal().width
     (80 if curses raises an error), and is capped at 200.
     Finishes by calling self.setup().
     """
     # instance attributes, feels safer
     self.name = None
     self.options = None
     self.args = None
     self.__verbose = None
     self.__verbose_default = 0
     self.__timeout = None
     self.__timeout_default = 10
     self.__timeout_max = 86400
     self.__total_run_time = time.time()
     self.topfile = get_topfile()
     self._docstring = get_file_docstring(self.topfile)
     if self._docstring:
         self._docstring = '\n' + self._docstring.strip() + '\n'
     if self._docstring is None:
         self._docstring = ''
     self._topfile_version = get_file_version(self.topfile)
     # this doesn't work in unit tests
     # if self._topfile_version:
     #     raise CodingError('failed to get topfile version - did you set a __version__ in top cli program?') # pylint: disable=line-too-long
     self._cli_version = self.__version__
     self._utils_version = harisekhon.utils.__version__
     # returns 'python -m unittest' :-/
     # prog = os.path.basename(sys.argv[0])
     self._prog = os.path.basename(self.topfile)
     self._github_repo = get_file_github_repo(self.topfile)
     # _hidden attributes are shown in __dict__
     self.version = '{prog} version {topfile_version} '.format(prog=self._prog,
                                                               topfile_version=self._topfile_version) + \
                    '=>  CLI version {cli_version} '.format(cli_version=self._cli_version) + \
                    '=>  Utils version {utils_version}'.format(utils_version=self._utils_version)
     self.usagemsg = 'Hari Sekhon{sep}{github_repo}\n\n{prog}\n{docstring}\n'.format(\
                         sep=' - ' if self._github_repo else '',
                         github_repo=self._github_repo,
                         prog=self._prog,
                         docstring=self._docstring)
     self.usagemsg_short = 'Hari Sekhon%(_github_repo)s\n\n' % self.__dict__
     # set this in simpler client programs when you don't want to exclude
     # self.__parser = OptionParser(usage=self.usagemsg_short, version=self.version)
     # self.__parser = OptionParser(version=self.version)
     # will be added by default_opts later so that it's not annoyingly at the top of the option help
     # also this allows us to print full docstring for a complete description and not just the cli switches
     # description=self._docstring # don't want description printed for option errors
     width = os.getenv('COLUMNS', None)
     if isInt(width) and width:
         # environment variables are strings, min() and the formatter need a number
         width = int(width)
     else:
         try:
             width = Terminal().width
         except _curses.error:
             width = 80
     width = min(width, 200)
     self.__parser = OptionParser(add_help_option=False, formatter=IndentedHelpFormatter(width=width))
     # duplicate key error or duplicate options, sucks
     # self.__parser.add_option('-V', dest='version', help='Show version and exit', action='store_true')
     self.setup()
 def parse_json(self, json_data):
     """Report the largest garbage collection time from the Nifi system diagnostics.

     Takes the maximum 'collectionMillis' across the entries of
     systemDiagnostics -> aggregateSnapshot -> garbageCollection, raises
     CriticalError if it is not an integer, and passes the value formatted
     in seconds (2 decimal places) to self.check_thresholds().
     """
     snapshot = json_data['systemDiagnostics']['aggregateSnapshot']
     collectors = snapshot['garbageCollection']
     longest_millis = max(collector['collectionMillis'] for collector in collectors)
     if not isInt(longest_millis):
         raise CriticalError("collectionMillis '{}' is not an integer!!".format(longest_millis))
     longest_millis = int(longest_millis)
     # formatted string, not a float, is what gets reported and threshold-checked
     gc_secs = '{:.2f}'.format(longest_millis / 1000)
     self.ok()
     self.msg = 'Nifi Java GC last collection time = {} secs'.format(gc_secs)
     self.check_thresholds(gc_secs)
     perf_thresholds = self.get_perf_thresholds()
     self.msg += ' | gc_collection={}s{}'.format(gc_secs, perf_thresholds)
 def parse_json(self, json_data):
     """Build the status message for the statistics DB event queue.

     The value is read from json_data['statistics_db_event_queue'] and must
     be an integer, otherwise UnknownError is raised. It is then passed to
     self.check_thresholds() and emitted as perfdata.
     """
     event_queue = json_data['statistics_db_event_queue']
     if not isInt(event_queue):
         error_msg = "non-integer stats db event queue returned ('{0}'). {1}"
         raise UnknownError(error_msg.format(event_queue, support_msg_api()))
     event_queue = int(event_queue)
     summary = "{0} stats dbs event queue = {1}".format(self.name, event_queue)
     self.msg = summary
     self.check_thresholds(event_queue)
     # perfdata label / value first, then the threshold suffix
     self.msg += " | stats_db_event_queue={0}".format(event_queue)
     self.msg += self.get_perf_thresholds()
Exemple #47
0
 def parse_json(self, json_data):
     """Set OK when Nifi reports at least one available processor, CRITICAL otherwise.

     Reads systemDiagnostics -> aggregateSnapshot -> availableProcessors and
     raises CriticalError if it is not an integer.
     """
     snapshot = json_data['systemDiagnostics']['aggregateSnapshot']
     available = snapshot['availableProcessors']
     if not isInt(available):
         raise CriticalError("availableProcessors '{}' is not an integer!!".format(available))
     if int(available) > 0:
         self.ok()
         self.msg = 'Nifi status = OK, processors available'
         return
     # zero (or fewer) processors
     self.critical()
     self.msg = 'Nifi status = CRITICAL, no processors available'
 def check_app(self, app):
     """Check one Yarn application dict and build the status message for it.

     Goes critical if the state is not 'RUNNING', or if the user, queue or
     number of running containers do not satisfy self.app_user, self.queue
     and self.min_containers (each only checked when not None). The elapsed
     time in seconds is passed to self.check_thresholds().

     Returns the elapsed time in whole seconds.
     Raises UnknownError if runningContainers or elapsedTime is not an integer.
     """
     state = app['state']
     user = app['user']
     queue = app['queue']
     # Hadoop 2.2 doesn't have this field
     running_containers = None
     if 'runningContainers' in app:
         running_containers = app['runningContainers']
         if not isInt(running_containers, allow_negative=True):
             raise UnknownError('running_containers {} is not an integer!'.format(running_containers))
         running_containers = int(running_containers)
     elapsed_time = app['elapsedTime']
     if not isInt(elapsed_time):
         raise UnknownError('elapsed time {} is not an integer'.format(elapsed_time))
     # convert to int before dividing, elapsedTime is divided by 1000 to get seconds
     elapsed_time = int(int(elapsed_time) / 1000)
     self.msg = "Yarn application '{0}' state = '{1}'".format(app['name'], state)
     if state != 'RUNNING':
         self.critical()
     ##################
     # This shouldn't be used any more now using more targeted query to only return running apps
     # state = FAILED / KILLED also gets final status = FAILED KILLED, no point double printing
     if state == 'FINISHED':
         self.msg += ", final status = '{0}'".format(app['finalStatus'])
     ##################
     # the placeholder is required here, without it the user is silently dropped from the message
     self.msg += ", user = '{0}'".format(user)
     if self.app_user is not None and self.app_user != user:
         self.critical()
         self.msg += " (expected '{0}')".format(self.app_user)
     self.msg += ", queue = '{0}'".format(queue)
     if self.queue is not None and self.queue != queue:
         self.critical()
         self.msg += " (expected '{0}')".format(self.queue)
     if running_containers is not None:
         self.msg += ", running containers = {0}".format(running_containers)
         if self.min_containers is not None and running_containers < self.min_containers:
             self.critical()
             self.msg += " (< '{0}')".format(self.min_containers)
     self.msg += ", elapsed time = {0} secs".format(elapsed_time)
     self.check_thresholds(elapsed_time)
     return elapsed_time
 def parse_json(self, json_data):
     """Compute the HDFS used-space imbalance across live datanodes and check it against thresholds.

     LiveNodes (a JSON string inside json_data['beans'][0]) is decoded and
     the min and max 'usedSpace' found. The imbalance reported is
     (max - min) / max * 100, rounded to 2 decimal places.

     Raises CriticalError if no live datanodes are returned.
     Raises UnknownError if usedSpace is not an integer, if the computed
     values are inconsistent, or if the response cannot be parsed.
     """
     log.info('parsing response')
     try:
         live_nodes = json_data['beans'][0]['LiveNodes']
         live_node_data = json.loads(live_nodes)
         num_datanodes = len(live_node_data)
         if num_datanodes < 1:
             raise CriticalError("no live datanodes returned by JMX API from namenode '{0}:{1}'"\
                                 .format(self.host, self.port))
         min_space = None
         max_space = 0
         for datanode in live_node_data:
             used_space = live_node_data[datanode]['usedSpace']
             if not isInt(used_space):
                 raise UnknownError('usedSpace {} is not an integer! {}'.format(used_space, support_msg_api()))
             used_space = int(used_space)
             log.info("datanode '%s' used space = %s", datanode, used_space)
             if min_space is None or used_space < min_space:
                 min_space = used_space
             if used_space > max_space:
                 max_space = used_space
         divisor = max_space
         if divisor < 1:
             # the divisor is the max used space, guard against division by zero
             log.info('max used space < 1, resetting divisor to 1 (% will likely be very high)')
             divisor = 1
         if max_space < min_space:
             raise UnknownError('max_space < min_space')
         largest_imbalance_pc = float('{0:.2f}'.format(((max_space - min_space) / divisor) * 100))
         if largest_imbalance_pc < 0:
             raise UnknownError('largest_imbalance_pc < 0')
         self.ok()
         self.msg = '{0}% HDFS imbalance on space used'.format(largest_imbalance_pc)
         self.check_thresholds(largest_imbalance_pc)
         self.msg += ' across {0:d} datanode{1}'.format(num_datanodes, plural(num_datanodes))
         if self.verbose:
             self.msg += ', min used space = {0}, max used space = {1}'.format(min_space, max_space)
         if self.verbose and (self.is_warning() or self.is_critical()):
             self.msg += ' [imbalanced nodes: '
             for datanode in live_node_data:
                 # already validated as an integer in the loop above
                 used_space = int(live_node_data[datanode]['usedSpace'])
                 # use the zero-safe divisor and print the percentage that was compared,
                 # the '%' sign must sit outside the format spec
                 used_space_pc = used_space / divisor * 100
                 if used_space_pc > self.thresholds['warning']['upper']:
                     self.msg += '{0}({1:.2f}%),'.format(datanode, used_space_pc)
             self.msg = self.msg.rstrip(',') + ']'
         self.msg += " | 'HDFS imbalance on space used %'={0}".format(largest_imbalance_pc)
         self.msg += self.get_perf_thresholds()
         self.msg += " num_datanodes={0}".format(num_datanodes)
         self.msg += " min_used_space={0}".format(min_space)
         self.msg += " max_used_space={0}".format(max_space)
     except KeyError as _:
         raise UnknownError("failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"\
                            .format(self.host, self.port, _, support_msg_api()))
     except ValueError as _:
         # json.loads() raises ValueError when the LiveNodes string is not valid JSON
         raise UnknownError("invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"\
                            .format(self.host, self.port, _))
    def process_result(self, result):
        """Build the status message for the latest DockerHub build described by result.

        Logs each field read from result, derives the build latency in whole
        seconds as last_updated - created_date (fractional seconds are cut
        off before parsing), maps the numeric status through self.statuses
        (unrecognized codes are treated as 'Error') and goes critical for
        anything other than 'Success'.

        Raises UnknownError if the status is not an integer.
        """
        _id = result['id']
        log.info('latest build id: %s', _id)

        status = result['status']
        log.info('status: %s', status)
        if not isInt(status, allow_negative=True):
            error = 'non-integer status returned by DockerHub API. {0}'
            raise UnknownError(error.format(support_msg_api()))

        tag = result['dockertag_name']
        log.info('tag: %s', tag)

        trigger = result['cause']
        log.info('trigger: %s', trigger)

        created_date = result['created_date']
        log.info('created date: %s', created_date)

        last_updated = result['last_updated']
        log.info('last updated: %s', last_updated)

        # timestamps are parsed without their fractional seconds part
        timestamp_format = '%Y-%m-%dT%H:%M:%S'
        strptime = datetime.datetime.strptime
        created_datetime = strptime(created_date.split('.')[0], timestamp_format)
        updated_datetime = strptime(last_updated.split('.')[0], timestamp_format)
        build_latency = (updated_datetime - created_datetime).total_seconds()
        log.info('build latency (creation to last updated): %s', build_latency)
        # results in .0 floats anyway
        build_latency = int(build_latency)

        build_code = result['build_code']
        build_url = 'https://hub.docker.com/r/{0}/builds/{1}'.format(self.repo, build_code)
        log.info('latest build URL: %s', build_url)

        status_key = str(status)
        if status_key not in self.statuses:
            log.warning("status code '%s' not recognized! %s", status, support_msg_api())
            log.warning('defaulting to assume status is an Error')
            status = 'Error'
        else:
            status = self.statuses[status_key]
        if status != 'Success':
            self.critical()
        summary = "'{repo}' last completed build status: '{status}', tag: '{tag}', build code: {build_code}"
        self.msg += summary.format(repo=self.repo, status=status, tag=tag, build_code=build_code)
        if self.verbose:
            verbose_fields = (
                (', id: {0}', _id),
                (', trigger: {0}', trigger),
                (', created date: {0}', created_date),
                (', last updated: {0}', last_updated),
                (', build_latency: {0}', sec2human(build_latency)),
                (', build URL: {0}', build_url),
            )
            for template, value in verbose_fields:
                self.msg += template.format(value)
        self.msg += ' | build_latency={0:d}s'.format(build_latency)
 def get_request_ids(self):
     """Return the list of request ids belonging to self.cluster.

     Fetches '/clusters/<cluster>/requests' via self.get(), decodes the JSON
     and collects the 'id' of every item whose 'cluster_name' equals
     self.cluster. Calls die() if an id is not an integer or the response
     cannot be parsed.
     """
     content = self.get('/clusters/{cluster}/requests'.format(cluster=self.cluster))
     try:
         response = json.loads(content)
         request_ids = []
         for item in response['items']:
             request = item['Requests']
             if request['cluster_name'] != self.cluster:
                 continue
             request_id = request['id']
             if not isInt(request_id):
                 die('request id returned was not an integer! ' + support_msg_api())
             request_ids.append(request_id)
         return request_ids
     except (KeyError, ValueError) as exc:
         die('failed to parse response for request IDs: {0}. '.format(exc) + support_msg_api())
 def parse_json(self, json_data):
     """Report how long ago self.datanode last contacted the namenode.

     LiveNodes, DeadNodes and DecomNodes (JSON strings in
     json_data['beans'][0]) are decoded and searched with
     self.match_datanode(). A match in the decommissioning list raises
     warning, a match in the dead list raises critical. The last contact
     seconds are then passed to self.check_thresholds().

     Raises UnknownError if the datanode is in none of the lists, if the
     last contact value is not a non-negative integer, or if the response
     cannot be parsed.
     """
     log.info('parsing response')
     try:
         bean = json_data['beans'][0]
         raw_live = bean['LiveNodes']
         raw_dead = bean['DeadNodes']
         raw_decom = bean['DecomNodes']
         live_nodes = json.loads(raw_live)
         dead_nodes = json.loads(raw_dead)
         decom_nodes = json.loads(raw_decom)
         self.print_nodes(live_nodes=live_nodes, dead_nodes=dead_nodes, decom_nodes=decom_nodes)
         last_contact = None
         for node_name in live_nodes:
             if self.match_datanode(self.datanode, node_name):
                 last_contact = live_nodes[node_name]['lastContact']
         # always check decom and dead nodes regardless if last contact was found in live nodes
         # gives an additional safety check to escalate to warning / critical
         self.msg = ''
         for node_name in decom_nodes:
             if self.match_datanode(self.datanode, node_name):
                 last_contact = decom_nodes[node_name]['lastContact']
                 self.warning()
                 self.msg = 'Decommissioning '
         for node_name in dead_nodes:
             if self.match_datanode(self.datanode, node_name):
                 last_contact = dead_nodes[node_name]['lastContact']
                 self.critical()
                 self.msg = 'Dead '
         if last_contact is None:
             not_found = "datanode '{0}' is not present in any of the live, ".format(self.datanode)
             raise UnknownError(not_found + "decommissioning or dead node lists!")
         if not isInt(last_contact):
             non_integer = "non-integer '{0}' returned for last contact seconds by namenode '{1}:{2}'"
             raise UnknownError(non_integer.format(last_contact, self.host, self.port))
         last_contact = int(last_contact)
         if last_contact < 0:
             raise UnknownError('last_contact_secs {} < 0!'.format(last_contact))
         summary = "HDFS datanode '{0}' last contact with namenode was {1} sec{2} ago"
         self.msg += summary.format(self.datanode, last_contact, plural(last_contact))
         self.check_thresholds(last_contact)
         self.msg += ' | datanode_last_contact_secs={0}'.format(last_contact)
         self.msg += self.get_perf_thresholds()
     except KeyError as exc:
         parse_failure = "failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"
         raise UnknownError(parse_failure.format(self.host, self.port, exc, support_msg_api()))
     except ValueError as exc:
         invalid_json = "invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"
         raise UnknownError(invalid_json.format(self.host, self.port, exc))
Exemple #53
0
 def __init__(self):
     """Set the separators, quiet flag and the default number of lines.

     The default is 10 lines, unless the LINES environment variable is set
     to an integer, in which case it is half of that minus one.
     """
     # Python 2.x compatible call, on Python 3.x only this could be super().__init__()
     super(HeadTail, self).__init__()
     default_lines = 10
     # this is usually None unless you explicitly 'export LINES'
     env_lines = os.getenv('LINES')
     if env_lines and isInt(env_lines):
         default_lines = int(int(env_lines) / 2) - 1
     self.default_num_lines = default_lines
     self.num_lines = self.default_num_lines
     #self.sep = '...'
     self.sep = '-' * 80
     self.docsep = '=' * 80
     self.quiet = False
 def parse_json(self, json_data):
     """Report the HDFS total block count from json_data['beans'][0]['TotalBlocks'].

     The count is passed to self.check_thresholds() and emitted as perfdata.
     Raises UnknownError if the value is not an integer or an expected key
     is missing from the response.
     """
     log.info('parsing response')
     try:
         bean = json_data['beans'][0]
         block_count = bean['TotalBlocks']
         if not isInt(block_count):
             non_integer = 'non-integer returned by NameNode for number of total blocks! {0}'
             raise UnknownError(non_integer.format(support_msg_api()))
         block_count = int(block_count)
         self.msg = 'HDFS Total Blocks = {0:d}'.format(block_count)
         self.check_thresholds(block_count)
         perf_thresholds = self.get_perf_thresholds()
         self.msg += ' | hdfs_total_blocks={0:d}{1}'.format(block_count, perf_thresholds)
     except KeyError as exc:
         parse_failure = "failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"
         raise UnknownError(parse_failure.format(self.host, self.port, exc, support_msg_api()))
Exemple #55
0
 def parse_output(self, content):
     """Parse the HBase Master UI HTML and record the servers with the fewest / most regions.

     Finds the table inside the 'tab_baseStats' div, verifies the
     'ServerName' and 'Num. Regions' column headers, then walks each server
     row (rows matching self.total_regex are skipped) updating
     self.server_min_regions and self.server_max_regions as
     (server, num_regions) tuples.

     Calls qquit('UNKNOWN', ...) if the headers have changed, a region count
     is not an integer, or the expected HTML structure is not found.
     """
     soup = BeautifulSoup(content, 'html.parser')
     if log.isEnabledFor(logging.DEBUG):
         log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
     # shorter to just catch NoneType attribute error when tag not found and returns None
     try:
         basestats = soup.find('div', {'id': 'tab_baseStats'})
         table = basestats.find('table')
         #for table in basestats:
         rows = table.findAll('tr')
         headers = rows[0].findAll('th')
         header_server = headers[0].get_text()
         header_regions = headers[3].get_text()
         wider_table = len(headers) > 4
         # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
         # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
         if wider_table:
             header_regions = headers[4].get_text()
         if header_server != 'ServerName':
             qquit('UNKNOWN', "Table headers in Master UI have changed" +
                   " (got {0}, expected 'ServerName'). ".format(header_server) + support_msg())
         if header_regions != 'Num. Regions':
             qquit('UNKNOWN', "Table headers in Master UI have changed" +
                   " (got {0}, expected 'Num. Regions'). ".format(header_regions) + support_msg())
         log.debug('%-50s\tnum_regions', 'server')
         for row in rows[1:]:
             # this can be something like:
             # 21689588ba40,16201,1473775984259
             # so don't apply isHost() validation because it'll fail FQDN / IP address checks
             cols = row.findAll('td')
             server = cols[0].get_text()
             if self.total_regex.match(server):
                 continue
             num_regions = cols[3].get_text()
             if wider_table:
                 num_regions = cols[4].get_text()
             if not isInt(num_regions):
                 # each fragment is formatted on its own with a single argument, so the index
                 # must be {0} - {1} would raise IndexError and be swallowed by the except below
                 qquit('UNKNOWN', "parsing error - got '{0}' for num regions".format(num_regions) +
                       " for server '{0}', was expecting integer.".format(server) +
                       " UI format must have changed. " + support_msg())
             num_regions = int(num_regions)
             log.debug('%-50s\t%s', server, num_regions)
             if self.server_min_regions[1] is None or num_regions < self.server_min_regions[1]:
                 self.server_min_regions = (server, num_regions)
             if self.server_max_regions[1] is None or num_regions > self.server_max_regions[1]:
                 self.server_max_regions = (server, num_regions)
     except (AttributeError, TypeError, IndexError):
         qquit('UNKNOWN', 'failed to find parse output')
 def parse_json(self, json_data):
     """Report the longest last-GC duration among the Java GarbageCollector beans.

     Collects LastGcInfo['duration'] from every bean whose name starts with
     'java.lang:type=GarbageCollector,name=' and takes the maximum. The
     value, formatted in seconds to 2 decimal places, is passed to
     self.check_thresholds().

     Raises UnknownError if no GC durations are found.
     """
     durations = []
     for bean in json_data['beans']:
         if 'name' not in bean or bean['name'][:37] != 'java.lang:type=GarbageCollector,name=':
             continue
         gc_info = bean['LastGcInfo']
         if not gc_info:
             continue
         if 'duration' in gc_info and isInt(gc_info['duration']):
             durations.append(int(gc_info['duration']))
     if not durations:
         raise UnknownError('no Java GC times found')
     longest_millis = int(max(durations))
     # formatted string, not a float, is what gets reported and threshold-checked
     gc_secs = '{:.2f}'.format(longest_millis / 1000)
     self.ok()
     self.msg = '{} Java GC last duration = {} secs'.format(self.name[0], gc_secs)
     self.check_thresholds(gc_secs)
     perf_thresholds = self.get_perf_thresholds()
     self.msg += ' | gc_duration={}s{}'.format(gc_secs, perf_thresholds)
 def parse_json(self, json_data):
     """Compute the block count imbalance across live datanodes and check it against thresholds.

     LiveNodes (a JSON string inside json_data['beans'][0]) is decoded and
     the min and max 'numBlocks' found. The imbalance reported is
     (max - min) / min * 100 rounded to 2 decimal places, with the divisor
     reset to 1 when min is below 1.

     Raises CriticalError if no live datanodes are returned.
     Raises UnknownError if numBlocks is not an integer or the response
     cannot be parsed.
     """
     log.info('parsing response')
     try:
         live_node_data = json.loads(json_data['beans'][0]['LiveNodes'])
         num_datanodes = len(live_node_data)
         if num_datanodes < 1:
             no_datanodes = "no live datanodes returned by JMX API from namenode '{0}:{1}'"
             raise CriticalError(no_datanodes.format(self.host, self.port))
         block_counts = []
         for datanode in live_node_data:
             blocks = live_node_data[datanode]['numBlocks']
             if not isInt(blocks):
                 raise UnknownError('numBlocks {} is not an integer! {}'.format(blocks, support_msg_api()))
             blocks = int(blocks)
             log.info("datanode '%s' has %s blocks", datanode, blocks)
             block_counts.append(blocks)
         # max starts from a floor of 0, min has no floor
         max_blocks = max([0] + block_counts)
         min_blocks = min(block_counts) if block_counts else None
         log.info("max blocks on a single datanode = %s", max_blocks)
         log.info("min blocks on a single datanode = %s", min_blocks)
         if min_blocks is None:
             raise UnknownError('min_blocks is None')
         if min_blocks < 1:
             log.info("min blocks < 1, resetting divisor to 1 (% will be very high)")
             divisor = 1
         else:
             divisor = min_blocks
         block_imbalance = float("{0:.2f}".format((max_blocks - min_blocks) / divisor * 100))
         summary = '{0}% block imbalance across {1} datanode{2}'
         self.msg = summary.format(block_imbalance, num_datanodes, plural(num_datanodes))
         self.ok()
         self.check_thresholds(block_imbalance)
         if self.verbose:
             self.msg += ' (min blocks = {0}, max blocks = {1})'.format(min_blocks, max_blocks)
         self.msg += " | block_imbalance={0}%".format(block_imbalance)
         self.msg += self.get_perf_thresholds()
         self.msg += " num_datanodes={0}".format(num_datanodes)
         self.msg += " min_blocks={0}".format(min_blocks)
         self.msg += " max_blocks={0}".format(max_blocks)
     except KeyError as exc:
         parse_failure = "failed to parse json returned by NameNode at '{0}:{1}': {2}. {3}"
         raise UnknownError(parse_failure.format(self.host, self.port, exc, support_msg_api()))
     except ValueError as exc:
         invalid_json = "invalid json returned for LiveNodes by Namenode '{0}:{1}': {2}"
         raise UnknownError(invalid_json.format(self.host, self.port, exc))
    def run(self):
        """Query a Jenkins node and report whether it is online and how many executors it has.

        Connects to Jenkins at protocol://host:port. With self.list_nodes
        set, prints the node names and exits with the UNKNOWN code.
        Otherwise fetches the info for self.node ('master' is looked up as
        '(master)'), goes critical if the node is offline and passes the
        number of executors to self.check_thresholds().

        Raises CriticalError if the node is not found or Jenkins returns an error.
        Raises UnknownError if numExecutors is not an integer.
        """
        server_url = '{proto}://{host}:{port}'.format(proto=self.protocol, host=self.host, port=self.port)
        try:
            log.debug('setting up Jenkins connection to %s', server_url)
            start_time = time.time()
            server = jenkins.Jenkins(server_url, username=self.user, password=self.password, timeout=self.timeout / 3)
            if log.isEnabledFor(logging.DEBUG):
                log.debug('getting user')
                user = server.get_whoami()
                log.debug('connected as user %s', jsonpp(user))
            if self.list_nodes:
                log.debug('getting Jenkins nodes')
                nodes = server.get_nodes()
                log.debug('nodes: %s', nodes)
                print('Jenkins nodes:\n')
                for _ in nodes:
                    print(_['name'])
                sys.exit(ERRORS['UNKNOWN'])
            # doesn't find 'master' node despite showing it in the list of nodes, jenkins puts brackets around master
            if self.node == 'master':
                self.node = '(master)'
            node = server.get_node_info(self.node)
        except jenkins.NotFoundException:
            raise CriticalError("node '{0}' not found, did you specify the correct name? See --list to see nodes"\
                                .format(self.node))
        except jenkins.JenkinsException as _:
            raise CriticalError(_)

        query_time = time.time() - start_time
        if log.isEnabledFor(logging.DEBUG):
            log.debug('%s', jsonpp(node))
        offline = node['offline']
        offline_reason = node['offlineCauseReason']
        num_executors = node['numExecutors']
        # validate before converting, otherwise int() fails first and the check is never reached
        if not isInt(num_executors):
            raise UnknownError('numExecutors returned non-integer! {0}'.format(support_msg_api()))
        num_executors = int(num_executors)
        if offline:
            self.critical()
            self.msg += 'offline: {0}'.format(offline_reason)
        else:
            self.msg += 'online'
        self.msg += ', num executors = {0}'.format(num_executors)
        self.check_thresholds(num_executors)
        self.msg += ' | num_executors={0:d}'.format(num_executors)
        self.msg += self.get_perf_thresholds(boundary='lower')
        self.msg += ' query_time={0:.4f}s'.format(query_time)
Exemple #59
0
 def parse(content):
     """Return compactionQueueLength from the RegionServer bean of a JMX JSON document.

     content is the JSON text. The bean named
     'Hadoop:service=HBase,name=RegionServer,sub=Server' is located in
     the 'beans' list and its compactionQueueLength returned as found.

     Calls qquit('UNKNOWN', ...) if the JSON is invalid, the value is not
     an integer, or no such bean is present.
     """
     try:
         _ = json.loads(content)
         if log.isEnabledFor(logging.DEBUG):
             log.debug(jsonpp(_))
         for bean in _['beans']:
             if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                 if log.isEnabledFor(logging.DEBUG):
                     log.debug('found RegionServer section:')
                     log.debug(jsonpp(bean))
                 compaction_queue_size = bean['compactionQueueLength']
                 if not isInt(compaction_queue_size):
                     qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api())
                 return compaction_queue_size
     except ValueError as _:
         # format the exception into the message, an exception object cannot be concatenated to a str
         qquit('UNKNOWN', '{0}: failed to parse HBase Master jmx info. '.format(_) + support_msg_api())
     qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')