def req(self, url, method='post', body=None):
    assert isStr(method)
    log.debug('%s %s', method.upper(), url)
    headers = {"Content-Type": "application/json",
               "Accept": "application/json",
               "JSESSIONID": self.jsessionid}
    log.debug('headers: %s', headers)
    start_time = time.time()
    try:
        req = getattr(requests, method.lower())(url,
                                                #cookies=self.jar,
                                                data=body,
                                                headers=headers)
        for cookie_tuple in req.cookies.items():
            if cookie_tuple[0] == 'JSESSIONID':
                self.jsessionid = cookie_tuple[1].rstrip('/')
        timing = time.time() - start_time
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("response: %s %s", req.status_code, req.reason)
        content = req.content
        try:
            content = jsonpp(req.content).strip()
        except ValueError:
            pass
        log.debug("content:\n%s\n%s\n%s", '='*80, content, '='*80)
    if req.status_code != 200:
        info = ''
        try:
            info = ': {0}'.format(json.loads(req.content)['result'])
        except (KeyError, ValueError):
            pass
        qquit('CRITICAL', "%s %s%s" % (req.status_code, req.reason, info))
    return (req, timing)

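# Example call pattern for req(), as used by the ingestion/workflow checks further below
# (the URL suffix and body values here are illustrative only):
#
#   (response, query_time) = self.req(url='{url_base}/ingestion/publish/getFileIndex'
#                                     .format(url_base=self.url_base),
#                                     body=json.dumps({'chunkSize': 10, 'currentPage': 1}))
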
def parse_host_name(self, item):  # pylint: disable=no-self-use
    if isStr(item):
        item = json.loads(item)
    try:
        return item['Hosts']['host_name']
    except KeyError as _:
        qquit('CRITICAL', 'failed to parse Ambari host name: %s' % _)

def usage(self, msg='', status='UNKNOWN'):
    if msg:
        print('%s\n' % msg)
    else:
        print(self.usagemsg)
    self.__parser.print_help()
    qquit(status)

def parse(self, content):
    # could also collect lines after 'Regions-in-transition' if parsing /dump
    # sample:
    # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
    # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
    soup = BeautifulSoup(content, 'html.parser')
    #if log.isEnabledFor(logging.DEBUG):
    #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
    # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
    regions_stuck_in_transition = 0
    try:
        headings = soup.findAll('h2')
        for heading in headings:
            log.debug("checking heading '%s'", heading)
            if heading.get_text() == "Regions in Transition":
                log.debug('found Regions in Transition section header')
                table = heading.find_next('table')
                log.debug('checking first following table')
                regions_stuck_in_transition = self.parse_table(table)
                if not isInt(regions_stuck_in_transition):
                    qquit('UNKNOWN', 'parse error - ' +
                          'got non-integer \'{0}\' for regions stuck in transition when parsing HMaster UI'\
                          .format(regions_stuck_in_transition))
        return regions_stuck_in_transition
        #qquit('UNKNOWN', 'parse error - failed to find table data for regions stuck in transition')
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to parse HBase Master UI status page. ' + support_msg())

def send_blueprint_file(self, filename, name=''):
    # log.debug('send_blueprint_file(%s, %s)' % (filename, name))
    validate_file(filename, 'blueprint', nolog=True)
    try:
        _ = open(str(filename))
        file_data = _.read()
    except IOError as _:
        err = "failed to read Ambari Blueprint from file '%s': %s" % (filename, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    if not name:
        try:
            name = self.parse_blueprint_name(file_data)
            log.info("name not specified, determined blueprint name from file contents as '%s'" % name)
        except KeyError as _:
            pass
    if not name:
        name = os.path.splitext(os.path.basename(filename))[0]
        log.info("name not specified and couldn't determine blueprint name from blueprint data, " +
                 "reverting to using filename without extension '%s'" % name)  # pylint: disable=line-too-long
    # this solves the issue of having duplicate Blueprint.blueprint_name keys
    try:
        json_data = json.loads(file_data)
        json_data['Blueprints']['blueprint_name'] = name
        data = json.dumps(json_data)
        log.info("reset blueprint field name to '%s'" % name)
    except ValueError as _:
        qquit('CRITICAL', "invalid json found in file '%s': %s" % (filename, _))
    except KeyError as _:
        log.warn('failed to reset the Blueprint name: %s' % _)
    return self.send_blueprint(name, data)

def create_cluster(self, cluster, filename, blueprint=''):
    # log.debug('create_cluster(%s, %s)' % (filename, name))
    validate_file(filename, 'cluster hosts mapping', nolog=True)
    try:
        _ = open(str(filename))
        file_data = _.read()
    except IOError as _:
        err = "failed to read Ambari cluster host mapping from file '%s': %s" % (filename, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    log.info("creating cluster '%s' using file '%s'" % (cluster, filename))
    if not isJson(file_data):
        qquit('CRITICAL', "invalid json found in file '%s'" % filename)
    # don't have access to a blueprint name to enforce reset here
    # json_data = json.loads(file_data)
    # try:
    #     json_data['Blueprints']['blueprint_name'] = blueprint
    # except KeyError, e:
    #     qquit('CRITICAL', 'failed to (re)set blueprint name in cluster/hostmapping data before creating cluster')
    if blueprint:
        try:
            log.info("setting blueprint in cluster creation to '%s'" % blueprint)
            json_data = json.loads(file_data)
            json_data['blueprint'] = blueprint
            file_data = json.dumps(json_data)
        except KeyError as _:
            log.warn("failed to inject blueprint name '%s' in to cluster creation" % blueprint)
    response = self.send('clusters/%s' % cluster, file_data)
    log.info("Cluster creation submitted, see Ambari web UI to track progress")
    return response

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    table = self.get_opt('table')
    validate_host(host)
    validate_port(port)
    validate_database_tablename(table)
    # raises 500 error if table doesn't exist
    url = 'http://%(host)s:%(port)s/table.jsp?name=%(table)s' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        info = ''
        #if req.status_code == '500' and 'TableNotFoundException' in req.content:
        if 'TableNotFoundException' in req.content:
            info = 'table not found'
        qquit('CRITICAL', "%s %s %s" % (req.status_code, req.reason, info))
    is_table_compacting = self.parse_is_table_compacting(req.content)
    self.msg = 'HBase table \'{0}\' '.format(table)
    if is_table_compacting:
        self.warning()
        self.msg += 'has compaction in progress'
    else:
        self.msg += 'has no compaction in progress'

def parse(self, content):
    # could also collect lines after 'Regions-in-transition' if parsing /dump
    # sample:
    # hbase:meta,,1.1588230740 state=PENDING_OPEN, \
    # ts=Tue Nov 24 08:26:45 UTC 2015 (1098s ago), server=amb2.service.consul,16020,1448353564099
    soup = BeautifulSoup(content, 'html.parser')
    #if log.isEnabledFor(logging.DEBUG):
    #    log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
    # looks like HMaster UI doesn't print this section if there are no regions in transition, must assume zero
    longest_rit_time = None
    try:
        headings = soup.findAll('h2')
        for heading in headings:
            log.debug("checking heading '%s'", heading)
            if heading.get_text() == "Regions in Transition":
                log.debug('found Regions in Transition section header')
                table = heading.find_next('table')
                log.debug('checking first following table')
                rows = table.findChildren('tr')
                header_cols = rows[0].findChildren('th')
                self.assert_headers(header_cols)
                longest_rit_time = self.process_rows(rows)
        return longest_rit_time
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to parse HBase Master UI status page. %s' % support_msg())

def parse_version(self, soup):
    version = None
    try:
        attributes_table = soup.find('table', {'id': 'attributes_table'})
        rows = attributes_table.findAll('tr')
        num_rows = len(rows)
        self.sanity_check(num_rows > 5, 'too few rows ({0})'.format(num_rows))
        headers = rows[0].findAll('th')
        num_headers = len(headers)
        self.sanity_check(num_headers > 2, 'too few header columns ({0})'.format(num_headers))
        self.sanity_check(headers[0].text.strip() == 'Attribute Name',
                          'header first column does not match expected \'Attribute Name\'')
        self.sanity_check(headers[1].text.strip() == 'Value',
                          'header second column does not match expected \'Value\'')
        for row in rows:
            cols = row.findAll('td')
            num_cols = len(cols)
            if num_cols == 0:
                continue
            self.sanity_check(num_cols > 2, 'too few columns ({0})'.format(num_cols))
            if cols[0].text.strip() == 'HBase Version':
                version = cols[1].text.split(',')[0]
                break
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to find and parse HBase output. {0}\n{1}'\
              .format(support_msg(), traceback.format_exc()))
    # strip things like -hadoop2 at end
    version = version.split('-')[0]
    return version

def get_tables(self):
    try:
        tables = self.conn.tables()
        if not isList(tables):
            qquit('UNKNOWN', 'table list returned is not a list! ' + support_msg_api())
        return tables
    except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
        qquit('CRITICAL', 'error while trying to get table list: {0}'.format(_))

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    url = 'http://%(host)s:%(port)s/jmx' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    compaction_queue_size = self.parse(req.content)
    self.msg = 'HBase RegionServer compaction '
    if compaction_queue_size > 0:
        self.warning()
        self.msg += 'in progress'
    else:
        self.msg += 'not in progress'
    self.msg += ', compactionQueueSize = {0}'.format(compaction_queue_size)
    self.msg += ' | compactionQueueSize={0};0;0'.format(compaction_queue_size)

def assert_headers(header_cols):
    try:
        assert header_cols[0].get_text().strip() == 'Region'
        assert header_cols[1].get_text().strip() == 'State'
        assert header_cols[2].get_text().strip() == 'RIT time (ms)'
    except AssertionError as _:
        qquit('UNKNOWN', 'parsing failed, headers did not match expected - {0}'.format(_))

def connection(self, host, port, user, password, ssl=False, **kwargs):
    # must set X-Requested-By in newer versions of Ambari
    self.x_requested_by = user
    if user == 'admin':
        self.x_requested_by = os.getenv('USER', user)
    #log.info("contacting Ambari as '%s'" % self.user)
    if not isHost(host) or not isPort(port) or not isUser(user) or not password:
        raise InvalidOptionException('invalid options passed to AmbariBlueprint()')
    proto = 'http'  # pylint: disable=unused-variable
    if ssl:
        proto = 'https'
    self.host = host
    self.port = port
    self.user = user
    self.password = password
    # if kwargs.has_key('strip_config') and kwargs['strip_config']:
    if 'strip_config' in kwargs and kwargs['strip_config']:
        self.strip_config = True
    self.url_base = '%(proto)s://%(host)s:%(port)s/api/v1' % locals()
    if 'dir' in kwargs and kwargs['dir']:
        self.blueprint_dir = kwargs['dir']
    if not isDirname(self.blueprint_dir):
        qquit('UNKNOWN', 'invalid dir arg passed to AmbariBlueprintTool')
    try:
        if not self.blueprint_dir or not os.path.exists(self.blueprint_dir):
            log.info("creating blueprint data dir '%s'" % self.blueprint_dir)
            os.mkdir(self.blueprint_dir)
        if not os.path.isdir(self.blueprint_dir):
            raise IOError("blueprint dir '%s' already taken and is not a directory" % self.blueprint_dir)
    except IOError as _:
        die("failed to create dir '%s': %s" % (self.blueprint_dir, _))

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    self.validate_thresholds(integer=False)
    url = 'http://%(host)s:%(port)s/master-status' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    self.parse_output(req.content)
    log.info('server with min regions = %s regions on %s', self.server_min_regions[1], self.server_min_regions[0])
    log.info('server with max regions = %s regions on %s', self.server_max_regions[1], self.server_max_regions[0])
    imbalance = self.calculate_imbalance()
    self.msg = '{0}% region imbalance'.format(imbalance)
    self.check_thresholds(imbalance)
    self.msg += ' between HBase RegionServers hosting the most vs least number of regions'
    self.msg += ' (min = {0}, max = {1})'.format(self.server_min_regions[1], self.server_max_regions[1])
    self.msg += " | '% region imbalance'={0}%".format(imbalance)
    self.msg += self.get_perf_thresholds()
    self.msg += ' min_regions={0} max_regions={1}'.format(self.server_min_regions[1], self.server_max_regions[1])

def connect(self):
    log.info('connecting to HBase Thrift Server at %s:%s', self.host, self.port)
    try:
        # cast port to int to avoid low level socket module TypeError for ports > 32000
        self.conn = happybase.Connection(host=self.host, port=int(self.port), timeout=10 * 1000)  # ms
    except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
        qquit('CRITICAL', 'error connecting: {0}'.format(_))

def process_rows(rows):
    longest_rit_time = None
    # will skip header anyway when it doesn't find td (will contain th instead)
    # this will avoid accidentally skipping a row later if the input changes to rows[1:] instead of rows
    #for row in rows[1:]:
    for row in rows:
        print(row)
        cols = row.findChildren('td')
        # Regions in Transition rows only have 2 cols
        # <hex> region rows have Region, State, RIT time (ms)
        num_cols = len(cols)
        if num_cols == 0:
            # header row
            continue
        elif num_cols != 3:
            qquit('UNKNOWN', 'unexpected number of columns ({0}) '.format(num_cols) +
                  'for regions in transition table. ' + support_msg())
        if 'Regions in Transition' in cols[0].get_text():
            continue
        rit_time = cols[2].get_text().strip()
        if not isInt(rit_time):
            qquit('UNKNOWN', 'parsing failed, got region in transition time of ' +
                  "'{0}', expected integer".format(rit_time))
        rit_time = int(rit_time)
        if longest_rit_time is None or rit_time > longest_rit_time:
            longest_rit_time = rit_time
    return longest_rit_time

def main(self):
    try:
        # Python 2.x
        super(NagiosPlugin, self).main()
        # Python 3.x
        # super().__init__()
        # redirect_stderr_stdout()
    except CriticalError as _:
        qquit('CRITICAL', _)
    except WarningError as _:
        qquit('WARNING', _)
    except UnknownError as _:
        qquit('UNKNOWN', _)
    except CodingError as _:
        qquit('UNKNOWN', 'Programming Error: {0}. {1}'.format(_, support_msg()))
    except Exception as _:  # pylint: disable=broad-except
        exception_type = type(_).__name__
        if log.isEnabledFor(logging.DEBUG):
            log.debug("exception: '%s'", exception_type)
            log.debug(traceback.format_exc())
        msg = 'Nagios Plugin Exception: {exception_type}: {msg}'.format(exception_type=exception_type,
                                                                        msg=self.exception_msg())
        #msg = ', '.join([x.strip() for x in msg.split('\n')])
        # ', ' doesn't look nice for ':\n ...' => ':, ...' (snakebite OutOfNNException)
        #msg = '\t'.join([x.strip() for x in msg.split('\n')])
        #if self.options.verbose > 2:
        #    msg = type(_).__name__ + ': ' + msg
        msg += '. ' + support_msg()
        qquit('UNKNOWN', msg)

def get_ingestions(self, num=None, filter_opts=None):
    log.info('getting ingestion history')
    if num:
        chunk_size = num
        log.info('explicit number of results requested: %s', chunk_size)
    elif filter_opts:
        chunk_size = 10
        log.info('filters detected, defaulting number of results to %s', chunk_size)
    else:
        chunk_size = 100
        log.info('using catch all default result limit of %s', chunk_size)
    settings = {'chunkSize': chunk_size, 'currentPage': 1}
    if filter_opts is not None:
        if not isDict(filter_opts):
            code_error('passed non-dictionary for filter opts to get_ingestions')
        for key, value in sorted(filter_opts.items()):
            log.info("filter: '%s' = '%s'", key, value)
        settings = merge_dicts(settings, filter_opts)
    log.info('settings: %s', settings)
    log.info('querying Zaloni for ingestion history')
    (req, self.query_time) = self.req(url='{url_base}/ingestion/publish/getFileIndex'
                                      .format(url_base=self.url_base),
                                      # orders by newest first, but seems to return last 10 anyway
                                      body=json.dumps(settings))
    try:
        log.info('parsing JSON response')
        json_dict = json.loads(req.content)
    except ValueError as _:
        qquit('UNKNOWN', 'error parsing json returned by Zaloni: {0}'.format(_))
    return json_dict

def run(self):
    log.info("querying %s", self.software)
    url = "{protocol}://{host}:{port}/PolicyManagement/{api_version}/deployments".format(
        host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
    log.debug("GET %s", url)
    try:
        req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ""
        if "BadStatusLine" in str(_.message):
            errhint = " (possibly connecting to an SSL secured port without using --ssl?)"
        elif self.protocol == "https" and "unknown protocol" in str(_.message):
            errhint = " (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)"
        qquit("CRITICAL", str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", "=" * 80, req.content.strip(), "=" * 80)
    if req.status_code == 400 and req.reason == "Bad Request":
        qquit("CRITICAL",
              "{0}: {1} (possibly new install with no deployments yet?)".format(req.status_code, req.reason))
    if req.status_code != 200:
        qquit("CRITICAL", "{0}: {1}".format(req.status_code, req.reason))
    try:
        json_list = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_list))
            print("=" * 80)
        if not isList(json_list):
            raise ValueError("returned content is not a list")
        if not json_list:
            qquit("UNKNOWN", "no deployments found")
        last_deployment = json_list[0]
        userid = last_deployment["UserId"]
        description = last_deployment["Description"]
        hostname = last_deployment["HostName"]
        timestamp = last_deployment["timestamp"]
        last_deploy_datetime = datetime.strptime(timestamp, "%b %d, %Y %H:%M:%S %p")
    except (KeyError, ValueError) as _:
        qquit("UNKNOWN",
              "error parsing output from {software}: {exception}: {error}. {support_msg}".format(
                  software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    timedelta = datetime.now() - last_deploy_datetime
    mins = int(int(timedelta.total_seconds()) / 60)
    self.msg = "{software} last deployment was at '{timestamp}', {mins} mins ago".format(
        software=self.software, timestamp=timestamp, mins=mins)
    self.check_thresholds(mins)
    if self.verbose:
        self.msg += " by user '{userid}', host = '{hostname}', description = '{description}'".format(
            userid=userid, hostname=hostname, description=description)
    self.msg += " | mins_since_last_deployment={mins}{thresholds}".format(
        mins=mins, thresholds=self.get_perf_thresholds(boundary="lower"))

def __end__(self):
    super(NagiosPlugin, self).__end__()
    # enabling this would break existing PNP4Nagios data due to the change in num perfdata fields
    #if '|' not in self.msg:
    #    self.msg += ' |'
    #self.msg += ' check_time={0:.2f}s'.format(CLI.__total_plugin_time)
    log.info('end\n%s\n', '='*80)
    qquit(self.status, self.msg)

def timeout_handler(self, signum, frame):  # pylint: disable=unused-argument
    # problem with this is that it'll print and then the exit exception will be caught and quit() printed again
    # raising a custom TimeoutException will need to be handled in main, but that would also likely print and be
    # re-caught and re-printed by NagiosPlugin
    #print('self timed out after %d second%s' % (self.timeout, plural(self.timeout)))
    #sys.exit(ERRORS['UNKNOWN'])
    # if using die the same thing will happen since die is a custom func which prints and then calls exit,
    # only exit would be caught
    qquit('UNKNOWN', 'self timed out after %d second%s' % (self.timeout, plural(self.timeout)))

def save_all(self):
    log.info('finding all blueprints and clusters to blueprint')
    blueprints = self.get_blueprints()
    clusters = self.get_clusters()
    if not blueprints and not clusters:
        qquit('UNKNOWN', 'no Ambari Blueprints or Clusters found on server')
    for blueprint in blueprints:
        self.save_blueprint(blueprint)
    for cluster in clusters:
        self.save_cluster(cluster)

def get_timedelta(ingestion_date):
    ingestion_date = str(ingestion_date).strip()
    invalid_ingestion_dates = ('', 'null', 'None', None)
    if ingestion_date not in invalid_ingestion_dates:
        try:
            # parsing the date will break, notifying us, if the API format changes in future
            # whereas if millis changes to secs or similar we could be way off
            ingestion_datetime = datetime.strptime(ingestion_date, '%Y-%m-%d %H:%M:%S.%f')
        except ValueError as _:
            qquit('UNKNOWN', 'error parsing ingestion date time format: {0}'.format(_))
        time_delta = datetime.now() - ingestion_datetime
        return time_delta

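# Example of a timestamp accepted by the strptime format above (the value is illustrative):
#
#   get_timedelta('2015-11-24 08:26:45.123000')  # -> datetime.timedelta since that instant
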
def end(self):
    if self.node_count is None:
        raise UnknownError('node count is not set!')
    self.msg = '{0} {1}{2} {3}'.format(self.node_count, self.agent_name, plural(self.node_count), self.state)
    self.check_thresholds(self.node_count)
    if self.additional_info:
        self.msg += ', {0}'.format(self.additional_info)
    self.msg += ' | {0}s_{1}={2:d}s{3}'.format(self.agent_name, self.state, self.node_count,
                                               self.get_perf_thresholds())
    if self.additional_perfdata:
        self.msg += ' {0}'.format(self.additional_perfdata)
    qquit(self.status, self.msg)

def list(self, url_suffix):
    self.url = self.url_base + '/' + url_suffix
    try:
        response = self.get(url_suffix)
    except requests.exceptions.RequestException as _:
        err = 'failed to fetch list of Ambari Blueprints: %s' % _
        # log.critical(err)
        qquit('CRITICAL', err)
    json_data = json.loads(response)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("json_data = " + jsonpp(json_data))
    return json_data

def check_table(self):
    log.info('checking table \'%s\'', self.table)
    if not self.conn.is_table_enabled(self.table):
        qquit('CRITICAL', "table '{0}' is disabled!".format(self.table))
    table_conn = self.conn.table(self.table)
    families = table_conn.families()
    self.num_column_families = len(families)
    log.info('found %s column families: %s', self.num_column_families, families)
    for column_family in sorted(families):
        column = '{0}:{1}'.format(column_family, self.column_qualifier)
        self.check_write(table_conn, self.row, column)
        self.check_read(table_conn, self.row, column, self.value)
        self.check_delete(table_conn, self.row, column)

def run(self):
    try:
        if self.get_opt('list_metrics'):
            self.list_metrics()
        json_struct = self.get('lastdata', params={'metrics': self.metrics})
        metrics = self.parse_metrics(json_struct)
        self.msg_metrics(metrics)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))

def run(self):
    self.no_args()
    host = self.options.host
    port = self.options.port
    validate_host(host)
    validate_port(port)
    url = 'http://%(host)s:%(port)s/oozie/v1/admin/status' % locals()
    log.debug('GET %s' % url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s" % (req.status_code, req.reason))
    log.debug("content: '%s'" % req.content)
    if req.status_code != 200:
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    # should == NORMAL
    if not isJson(req.content):
        qquit('UNKNOWN', 'non-JSON returned by Oozie server at {0}:{1}'.format(host, port))
    status = None
    try:
        _ = json.loads(req.content)
        status = _['systemMode']
    except KeyError:
        qquit('UNKNOWN', 'systemMode key was not returned in output from Oozie. {0}'.format(support_msg_api()))
    self.msg = 'Oozie status = {0}'.format(status)
    if status == 'NORMAL':
        self.ok()
    else:
        self.critical()

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
    # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
    # https://issues.apache.org/jira/browse/HBASE-16636
    #url = 'http://%(host)s:%(port)s/jmx' % locals()
    # could get info from flat txt debug page but it doesn't contain the summary count
    #url = 'http://%(host)s:%(port)s/dump' % locals()
    url = 'http://%(host)s:%(port)s/master-status' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    regions_stuck_in_transition = self.parse(req.content)
    if regions_stuck_in_transition is None:
        qquit('UNKNOWN', 'parse error - failed to find number for regions stuck in transition')
    if not isInt(regions_stuck_in_transition):
        qquit('UNKNOWN', 'parse error - got non-integer for regions stuck in transition when parsing HMaster UI')
    if regions_stuck_in_transition == 0:
        self.ok()
    else:
        self.critical()
    self.msg = '{0} regions stuck in transition (ie. transitioning longer than HBase threshold)'\
               .format(regions_stuck_in_transition)
    self.msg += " | regions_stuck_in_transition={0};0;0".format(regions_stuck_in_transition)

def parse(self, req):
    soup = BeautifulSoup(req.content, 'html.parser')
    # if log.isEnabledFor(logging.DEBUG):
    #     log.debug("BeautifulSoup prettified:\n%s\n%s", soup.prettify(), '='*80)
    status = None
    try:
        status = soup.find('div', {'class': 'alert alert-success'}).get_text().strip()
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to parse Apache Drill status page. %s' % support_msg())
    if re.match('Running!?$', status):
        self.ok()
    else:
        self.critical()
    return status

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    self.validate_thresholds()
    # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for beans
    # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
    # https://issues.apache.org/jira/browse/HBASE-16636
    #url = 'http://%(host)s:%(port)s/jmx' % locals()
    # could get info from flat txt debug page but it doesn't contain the summary count
    #url = 'http://%(host)s:%(port)s/dump' % locals()
    url = 'http://%(host)s:%(port)s/master-status' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    regions_in_transition = self.parse(req.content)
    if regions_in_transition is None:
        qquit('UNKNOWN', 'parse error - failed to find number for regions in transition')
    if not isInt(regions_in_transition):
        qquit('UNKNOWN', 'parse error - got non-integer for regions in transition when parsing HMaster UI')
    if regions_in_transition == 0:
        self.ok()
    else:
        self.critical()
    self.msg = '{0} regions in transition'.format(regions_in_transition)
    self.check_thresholds(regions_in_transition)
    self.msg += " | regions_in_transition={0}".format(regions_in_transition)
    self.msg += self.get_perf_thresholds()

def check_ingestion(self, num, filter_opts=None, max_age=None, max_runtime=None):
    log.info('checking ingestion history')
    json_dict = self.get_ingestions(num, filter_opts)
    info = ''
    if self.verbose:
        for key in sorted(filter_opts):
            info += " {0}='{1}'".format(key, filter_opts[key])
    try:
        results = json_dict['result']
        if not results:
            qquit('CRITICAL', "no results found for ingestion{0}"\
                  .format('{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                          'Perhaps you specified incorrect filters? Use --list to see existing ingestions'))
        num_results = len(results)
        log.info('%s ingestion history results returned', num_results)
        self.check_statuses(results)
        if num:
            self.msg += ' out of last {0} ingest{1}'.format(num_results, plural(num_results))
        if self.history_mins:
            self.msg += ' within last {0} ({1} min{2})'.format(sec2human(self.history_mins * 60),
                                                               str(self.history_mins).rstrip('0').rstrip('.'),
                                                               plural(self.history_mins))
        longest_incomplete_timedelta = self.check_longest_incomplete_ingest(results, max_runtime)
        age_timedelta_secs = self.check_last_ingest_age(results, max_age=max_age)
        self.msg_filter_details(filter_opts=filter_opts)
        self.msg += ' |'
        self.msg += ' last_ingest_age={0}s;{1}'.format(age_timedelta_secs, max_age * 3600 if max_age else '')
        self.msg += ' longest_incomplete_ingest_age={0}s;{1}'\
                    .format(self.timedelta_seconds(longest_incomplete_timedelta)
                            if longest_incomplete_timedelta else 0,
                            max_age * 3600 if max_age else '')
        self.msg += ' auth_time={auth_time}s query_time={query_time}s'.format(auth_time=self.auth_time,
                                                                              query_time=self.query_time)
    except KeyError as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))

def fetch(self, url_suffix):
    err = ''
    try:
        response = self.get(url_suffix)
    except requests.exceptions.RequestException as _:
        err = "failed to fetch Ambari Blueprint from '%s': %s" % (self.url, _)
        # log.critical(err)
        qquit('CRITICAL', err)
    json_data = json.loads(response)
    if log.isEnabledFor(logging.DEBUG):
        log.debug("blueprint = " + jsonpp(json_data))
    try:
        del json_data['href']
        log.debug("stripped href as it's not valid if re-submitting the blueprint to Ambari")
    except KeyError as _:
        pass
    # Ambari 2.1.3 supports this according to:
    # https://cwiki.apache.org/confluence/display/AMBARI/Blueprints#Blueprints-ClusterCreationTemplateStructure
    # json_data['config_recommendation_strategy'] = 'NEVER_APPLY'  # default
    # json_data['config_recommendation_strategy'] = 'ONLY_STACK_DEFAULTS_APPLY'
    # json_data['config_recommendation_strategy'] = 'ALWAYS_APPLY'
    if self.strip_config:
        log.info('stripping out config sections of blueprints to make more generic')
        try:
            del json_data['configurations']
            for hostgroup in json_data['host_groups']:
                del hostgroup['configurations']
        except KeyError as _:
            pass
    try:
        json_data['host_groups'] = list_sort_dicts_by_value(json_data['host_groups'], 'name')
        for hostgroup in json_data['host_groups']:
            hostgroup['components'] = list_sort_dicts_by_value(hostgroup['components'], 'name')
    except KeyError as _:
        qquit('CRITICAL', 'failed to sort blueprint: %s' % _)
    return jsonpp(json_data)

def get_latest_build(self, content):
    build = None
    builds = json.loads(content)
    if not builds:
        qquit('UNKNOWN', "no Travis CI builds returned by the Travis API." +
              " Either the specified repo '{0}' doesn't exist".format(self.repo) +
              " or no builds have happened yet?" +
              " Also remember the repo is case sensitive, for example 'harisekhon/nagios-plugins' returns this" +
              " blank build set whereas 'HariSekhon/nagios-plugins' succeeds" +
              " in returning latest builds information")
    # get latest finished build
    for _ in builds:
        if _['state'] == 'finished':
            if build is None:
                build = _
        else:
            self.builds_in_progress += 1
    if build is None:
        qquit('UNKNOWN', 'no recent builds finished yet')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("latest build:\n{0}".format(jsonpp(build)))
    return build

def main(self):
    # log.debug('running main()')
    log.setLevel(logging.WARN)
    self.setup()
    try:
        self.add_options()
        self.add_default_opts()
    except InvalidOptionException as _:
        self.usage(_)
    try:
        self.__parse_args__()
        # broken
        # autoflush()
        # too late
        # os.environ['PYTHONUNBUFFERED'] = "anything"
        self.verbose = self.get_opt('verbose')
        if self.is_option_defined('quiet') and self.get_opt('quiet'):
            self.verbose = 0
        elif self.verbose > 2:
            log.setLevel(logging.DEBUG)
        elif self.verbose > 1:
            log.setLevel(logging.INFO)
        elif self.verbose > 0 and self._prog[0:6] != 'check_':
            log.setLevel(logging.WARN)
        if self.options.debug:
            log.setLevel(logging.DEBUG)  # pragma: no cover
            log.debug('enabling debug logging')
            if self.verbose < 3:
                self.verbose = 3
        log.info('Hari Sekhon %s', self.version)
        log.info(self._github_repo)
        log.info('verbose level: %s (%s)', self.verbose, logging.getLevelName(log.getEffectiveLevel()))
        if self.timeout is not None:
            validate_int(self.timeout, 'timeout', 0, self.timeout_max)
            log.debug('setting timeout alarm (%s)', self.timeout)
            signal.signal(signal.SIGALRM, self.timeout_handler)
            signal.alarm(int(self.timeout))
        # if self.options.version:
        #     print(self.version)
        #     sys.exit(ERRORS['UNKNOWN'])
        self.process_options()
        self.process_args()
        try:
            self.run()
        except CriticalError as _:
            qquit('CRITICAL', _)
        except WarningError as _:
            qquit('WARNING', _)
        except UnknownError as _:
            qquit('UNKNOWN', _)
        self.__end__()
    except InvalidOptionException as _:
        self.usage(_)  # pragma: no cover
    except KeyboardInterrupt:
        # log.debug('Caught control-c...')
        print('Caught control-c...')  # pragma: no cover

def parse_output(self, content):
    soup = BeautifulSoup(content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
    # shorter to just catch NoneType attribute error when tag not found and returns None
    try:
        basestats = soup.find('div', {'id': 'tab_baseStats'})
        table = basestats.find('table')
        #for table in basestats:
        rows = table.findAll('tr')
        headers = rows[0].findAll('th')
        header_server = headers[0].text
        # HBase 1.1 in HDP 2.3: ServerName | Start time | Requests Per Second | Num. Regions
        # HBase 1.2 (Apache):   ServerName | Start time | Version | Requests per Second | Num. Regions
        # HBase 1.4 (Apache):   ServerName | Start time | Last Contact | Version | Requests Per Second | Num. Regions
        num_regions_index = len(headers) - 1
        header_num_regions = headers[num_regions_index].text
        if header_server != 'ServerName':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'ServerName'). ".format(header_server) + support_msg())
        if header_num_regions != 'Num. Regions':
            qquit('UNKNOWN', "Table headers in Master UI have changed" +
                  " (got {0}, expected 'Num. Regions'). ".format(header_num_regions) + support_msg())
        log.debug('%-50s\tnum_regions', 'server')
        for row in rows[1:]:
            # this can be something like:
            # 21689588ba40,16201,1473775984259
            # so don't apply isHost() validation because it'll fail FQDN / IP address checks
            cols = row.findAll('td')
            server = cols[0].text
            if self.total_regex.match(server):
                continue
            num_regions = cols[num_regions_index].text
            if not isInt(num_regions):
                qquit('UNKNOWN', "parsing error - got '{0}' for num regions".format(num_regions) +
                      " for server '{0}', was expecting integer.".format(server) +
                      " UI format must have changed" + support_msg())
            num_regions = int(num_regions)
            log.debug('%-50s\t%s', server, num_regions)
            if self.server_min_regions[1] is None or num_regions < self.server_min_regions[1]:
                self.server_min_regions = (server, num_regions)
            if self.server_max_regions[1] is None or num_regions > self.server_max_regions[1]:
                self.server_max_regions = (server, num_regions)
    except (AttributeError, TypeError, IndexError):
        qquit('UNKNOWN', 'failed to find and parse output')

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    log.info('querying Tachyon Master')
    url = 'http://%(host)s:%(port)s/workers' % locals()
    log.debug('GET %s' % url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s" % (req.status_code, req.reason))
    log.debug("content:\n{0}\n{1}\n{2}".format('=' * 80, req.content.strip(), '=' * 80))
    if req.status_code != 200:
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    soup = BeautifulSoup(req.content, 'html.parser')
    dead_workers = 0
    try:
        dead_workers = len([_ for _ in soup.find(id='data2').find('tbody').find_all('tr') if _])
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to find and parse Tachyon Master info for dead workers' % self.__dict__)
    try:
        dead_workers = int(dead_workers)
    except (ValueError, TypeError):
        qquit('UNKNOWN', 'Tachyon Master dead workers parsing returned non-integer: {0}'.format(dead_workers))
    self.msg = 'Tachyon dead workers = {0}'.format(dead_workers)  # pylint: disable=attribute-defined-outside-init
    self.ok()
    # TODO: thresholds on number of dead workers (coming soon)
    if dead_workers:
        self.critical()

def get_version(self):
    content = self.get()
    try:
        json_list = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_list))
            print('='*80)
        if not isList(json_list):
            raise ValueError("non-list returned by API (is type '{0}')".format(type(json_list)))
        json_dict = json_list[0]
        if not isDict(json_dict):
            raise ValueError("non-dict found inside returned list (is type '{0}')".format(type(json_dict)))
        company_name = json_dict['company_name']
        company_website = json_dict['company_website']
        regex = re.compile(r'Blue\s*Talon', re.I)
        if not regex.match(company_name) and \
           not regex.match(company_website):
            qquit('UNKNOWN', 'Blue Talon name was not found in either company_name or company_website fields' \
                  + ', are you definitely querying a Blue Talon server?')
        build_version = json_dict['build_version']
        update_date = json_dict['update_date']
        api_version = json_dict['api_version']
        if not isVersion(api_version):
            qquit('UNKNOWN', '{0} api version unrecognized \'{1}\'. {2}'\
                  .format(self.software, api_version, support_msg_api()))
        if api_version != self.api_version:
            qquit('UNKNOWN', "unexpected API version '{0}' returned (expected '{1}')"\
                  .format(api_version, self.api_version))
        if self.verbose:
            extra_info = ' revision {revision} build {build}, schema revision = {schema_revision}'\
                         .format(revision=json_dict['revision_no'],
                                 build=json_dict['build_no'],
                                 schema_revision=json_dict['schema_revision'])
            extra_info += ', api version = {api_version}, update date = {update_date}'\
                          .format(api_version=api_version, update_date=update_date)
        else:
            extra_info = ', update date = {update_date}'.format(update_date=update_date)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    return (build_version, extra_info)

def check_workflow(self, workflow_name, workflow_id):
    log.info("checking workflow '%s' id '%s'", workflow_name, workflow_id)
    # GET /workflow/fetchWorkflowStatus/<instance_id> is also available but only uses wfId, doesn't support wfName
    # returns ['result']['list'] = [ {}, {}, ... ]
    (req, self.query_time) = self.req(url='{url_base}/workflow/publish/getWorkflowExecutionHistory'
                                      .format(url_base=self.url_base),
                                      # orders by newest first, but seems to return last 10 anyway
                                      body=json.dumps({'chunk_size': 1,
                                                       'currentPage': 1,
                                                       'wfName': workflow_name,
                                                       'wfId': workflow_id}))
    info = ''
    if workflow_name:
        info += " name '{0}'".format(workflow_name)
    if workflow_id:
        info += " id '{0}'".format(workflow_id)
    try:
        json_dict = json.loads(req.content)
        result = json_dict['result']
        not_found_err = '{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                        'Perhaps you specified the wrong name/id or the workflow hasn\'t run yet? ' + \
                        'Use --list to see existing workflows'
        if result is None:
            if self._all:
                return None
            qquit('CRITICAL', "no results found for workflow{0}".format(not_found_err))
        reports = result['jobExecutionReports']
        if not isList(reports):
            raise ValueError('jobExecutionReports is not a list')
        if not reports:
            qquit('CRITICAL', "no reports found for workflow{0}".format(not_found_err))
        # orders by newest first by default, checking last run only
        report = self.get_latest_complete_report(reports)
        status = report['status']
        if status == 'SUCCESS':
            pass
        elif status == 'INCOMPLETE':
            self.warning()
        else:
            self.critical()
        self.msg += "workflow '{workflow}' id '{id}' status = '{status}'".format(workflow=report['wfName'],
                                                                                 id=report['wfId'],
                                                                                 status=status)
        if not self._all:
            self.check_times(report['startDate'], report['endDate'])
        return status
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))

def run(self):
    self.no_args()
    self.host = self.get_opt('host')
    self.port = self.get_opt('port')
    self.table = self.get_opt('table')
    validate_host(self.host)
    validate_port(self.port)
    validate_database_tablename(self.table, 'hbase')
    try:
        log.info('connecting to HBase Thrift Server at %s:%s', self.host, self.port)
        # cast port to int to avoid low level socket module TypeError for ports > 32000
        self.conn = happybase.Connection(host=self.host, port=int(self.port), timeout=10 * 1000)  # ms
    except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
        qquit('CRITICAL', 'error connecting: {0}'.format(_))
    if self.get_opt('list'):
        tables = self.get_tables()
        print('HBase Tables:\n\n' + '\n'.join(tables))
        sys.exit(ERRORS['UNKNOWN'])
    log.info('checking table \'%s\'', self.table)
    is_enabled = None
    try:
        is_enabled = self.conn.is_table_enabled(self.table)
    except HBaseIOError as _:
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    except (socket.error, socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if not is_enabled:
        self.critical()
    self.msg = 'HBase table \'{0}\' enabled = {1}'.format(self.table, is_enabled)
    log.info('finished, closing connection')
    self.conn.close()

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    log.info('querying Tachyon Master')
    url = 'http://%(host)s:%(port)s/home' % locals()
    log.debug('GET %s' % url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s" % (req.status_code, req.reason))
    log.debug("content:\n{0}\n{1}\n{2}".format('=' * 80, req.content.strip(), '=' * 80))
    if req.status_code != 200:
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    soup = BeautifulSoup(req.content, 'html.parser')
    try:
        running_workers = soup.find('th', text=re.compile(r'Running\s+Workers:?', re.I))\
                              .find_next_sibling().get_text()
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to find and parse Tachyon Master info for running workers' % self.__dict__)
    try:
        running_workers = int(running_workers)
    except (ValueError, TypeError):
        qquit('UNKNOWN', 'Tachyon Master live workers parsing returned non-integer: {0}'.format(running_workers))
    self.msg = 'Tachyon running workers = {0}'.format(running_workers)  # pylint: disable=attribute-defined-outside-init
    self.ok()
    # TODO: thresholds on number of live workers (coming soon)
    if running_workers < 1:
        self.critical()

def run(self):
    log.info('querying %s', self.software)
    url = '{protocol}://{host}:{port}/PolicyManagement/{api_version}/resources'\
          .format(host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ''
        if 'BadStatusLine' in str(_.message):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(_.message):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
    try:
        json_dict = json.loads(req.content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_dict))
            print('='*80)
        if not isDict(json_dict):
            raise ValueError("non-dict returned by Blue Talon API (got type '{0}')".format(type(json_dict)))
        resource_domains_list = json_dict['resource_domains']
        if not isList(resource_domains_list):
            raise ValueError("non-list returned for 'resource_domains' key by Blue Talon API (got type '{0}')"\
                             .format(type(resource_domains_list)))
        num_resource_domains = len(resource_domains_list)
        num_resources = 0
        for resource_domain in resource_domains_list:
            resources = resource_domain['resources']
            if not isList(resources):
                raise ValueError("non-list found for resources in resource_domain '{0}' (got type '{1}')"\
                                 .format(resource_domain['resource_domain_name'], type(resources)))
            num_resources += len(resources)
        self.msg += '{num_resources} resources'.format(num_resources=num_resources)
        self.check_thresholds(num_resources)
        self.msg += ' across {num_resource_domains} resource domains'\
                    .format(num_resource_domains=num_resource_domains)
        self.msg += ' | num_resources={num_resources}{perf} num_resource_domains={num_resource_domains}'\
                    .format(num_resources=num_resources,
                            num_resource_domains=num_resource_domains,
                            perf=self.get_perf_thresholds())
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))

def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    self.table = self.get_opt('table')
    validate_host(host)
    validate_port(port)
    self.validate_thresholds(integer=False)
    try:
        log.info('connecting to HBase Thrift Server at %s:%s', host, port)
        # cast port to int to avoid low level socket module TypeError for ports > 32000
        self.conn = happybase.Connection(host=host, port=int(port), timeout=10 * 1000)  # ms
    except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
        qquit('CRITICAL', 'error connecting: {0}'.format(_))
    tables = self.conn.tables()
    if len(tables) < 1:
        qquit('CRITICAL', 'no HBase tables found!')
    if self.get_opt('list_tables'):
        print('Tables:\n\n' + '\n'.join(tables))
        sys.exit(ERRORS['UNKNOWN'])
    if self.table:
        if self.table not in tables:
            qquit('CRITICAL', "HBase table '{0}' does not exist!".format(self.table))
        self.process_table(self.table)
    else:
        for table in tables:
            self.process_table(table)
    log.info('finished, closing connection')
    self.conn.close()
    imbalance = self.calculate_imbalance()
    self.msg = '{0}% region imbalance'.format(imbalance)
    self.check_thresholds(imbalance)
    self.msg += ' between HBase RegionServers hosting the most vs least number of regions'
    if self.table:
        self.msg += " for table '{0}'".format(self.table)
    else:
        self.msg += ' across all tables'
    self.msg += ' (min = {0}, max = {1})'.format(self.server_min_regions[1], self.server_max_regions[1])
    self.msg += " | '% region imbalance'={0}%".format(imbalance)
    self.msg += self.get_perf_thresholds()
    self.msg += ' min_regions={0} max_regions={1}'.format(self.server_min_regions[1], self.server_max_regions[1])

def run(self):
    content = self.get()
    try:
        json_dict = json.loads(content)
        if log.isEnabledFor(logging.DEBUG):
            print(jsonpp(json_dict))
            print('='*80)
        if not isDict(json_dict):
            raise ValueError('returned content is not a dict')
        status = json_dict['status']
        if status != 'success':
            qquit('CRITICAL', "request status = '{0}' (expected 'success')".format(status))
        status_code = json_dict['statusCode']
        if status_code != 200:
            qquit('CRITICAL', "request status code = '{0}' (expected '200')".format(status_code))
        message = json_dict['message']
        data = json_dict['data']
        if not data:
            num_endpoints = 0
        elif not isList(data):
            qquit('CRITICAL', 'non-list returned for policy end points data')
        else:
            num_endpoints = len(data)
        match = re.match(r'Total \[(\d+)\] policy engine end point\(s\) found', message, re.I)
        if not match:
            raise ValueError('failed to parse message for confirmation of number of endpoints')
        message_num_endpoints = int(match.group(1))
        if num_endpoints != message_num_endpoints:
            raise ValueError('num endpoints does not match parsed value from returned message')
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {exception}: {error}. {support_msg}'\
              .format(software=self.software, exception=type(_).__name__, error=_, support_msg=support_msg_api()))
    self.msg = "{software} number of policy end points = {num_endpoints}"\
               .format(software=self.software, num_endpoints=num_endpoints)
    self.check_thresholds(num_endpoints)
    self.msg += ' | num_endpoints={num_endpoints}'.format(num_endpoints=num_endpoints) + self.get_perf_thresholds()

def main(self):
    # DEBUG env var is picked up immediately in pylib utils, do not override it here if so
    if os.getenv('DEBUG'):
        log.setLevel(logging.DEBUG)
    if not log.isEnabledFor(logging.DEBUG) and \
       not log.isEnabledFor(logging.ERROR):  # do not downgrade logging either
        log.setLevel(logging.WARN)
    self.setup()
    try:
        self.add_options()
        self.add_default_opts()
    except InvalidOptionException as _:
        self.usage(_)
    try:
        self.__parse_args__()
        # broken
        # autoflush()
        # too late
        # os.environ['PYTHONUNBUFFERED'] = "anything"
        log.info('Hari Sekhon %s', self.version)
        log.info(self._github_repo)
        log.info('verbose level: %s (%s)', self.verbose, logging.getLevelName(log.getEffectiveLevel()))
        if self.timeout is not None:
            validate_int(self.timeout, 'timeout', 0, self.timeout_max)
            log.debug('setting timeout alarm (%s)', self.timeout)
            signal.signal(signal.SIGALRM, self.timeout_handler)
            signal.alarm(int(self.timeout))
        # if self.options.version:
        #     print(self.version)
        #     sys.exit(ERRORS['UNKNOWN'])
        self.process_options()
        self.process_args()
        try:
            self.run()
        except CriticalError as _:
            qquit('CRITICAL', _)
        except WarningError as _:
            qquit('WARNING', _)
        except UnknownError as _:
            qquit('UNKNOWN', _)
        self.__end__()
    except InvalidOptionException as _:
        if log.isEnabledFor(logging.DEBUG):
            log.debug(traceback.format_exc())
        self.usage(_)  # pragma: no cover
    except KeyboardInterrupt:
        # log.debug('Caught control-c...')
        print('Caught control-c...')  # pragma: no cover

def process_table(self, table):
    try:
        table_handle = self.conn.table(table)
        regions = table_handle.regions()
        if len(regions) < 1:
            qquit('UNKNOWN', "no regions found for table '{0}'".format(table))
        for region in regions:
            log.debug("table '%s' region '%s'", table, region)
            server = region['server_name']
            self.server_region_counts[server] = self.server_region_counts.get(server, 0)
            self.server_region_counts[server] += 1
    except (socket.error, socket.timeout, ThriftException, HBaseIOError) as _:
        qquit('CRITICAL', _)
    except KeyError as _:
        qquit('UNKNOWN', 'failed to process region information. ' + support_msg_api())

def run(self):
    self.no_args()
    host = self.options.host
    port = self.options.port
    validate_host(host)
    validate_port(port)
    url = 'http://%(host)s:%(port)s/oozie/v1/admin/status' % locals()
    log.debug('GET %s' % url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s" % (req.status_code, req.reason))
    log.debug("content: '%s'" % req.content)
    if req.status_code != 200:
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    # should == NORMAL
    if not isJson(req.content):
        qquit('UNKNOWN', 'non-JSON returned by Oozie server at {0}:{1}'.format(host, port))
    status = None
    try:
        _ = json.loads(req.content)
        status = _['systemMode']
    except KeyError:
        qquit('UNKNOWN', 'systemMode key was not returned in output from Oozie. {0}'.format(support_msg_api()))
    self.msg = 'Oozie status = {0}'.format(status)
    if status == 'NORMAL':
        self.ok()
    else:
        self.critical()

def run(self):
    initial_start = time.time()
    try:
        connect_time = self.connect()
        if self.list_tables:
            tables = self.get_tables()
            print('HBase Tables:\n\n' + '\n'.join(tables))
            sys.exit(ERRORS['UNKNOWN'])
        self.check_table()
        log.info('finished, closing connection')
        self.conn.close()
    except HBaseIOError as _:
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        elif 'NoSuchColumnFamilyException' in _.message:
            qquit('CRITICAL', 'column family \'{0}\' does not exist'.format(self.column))
        else:
            qquit('CRITICAL', _)
    except (socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    total_time = (time.time() - initial_start) * 1000
    self.output(connect_time, total_time)

def parse(json_data):
    try:
        # it's already nicely laid out
        #if log.isEnabledFor(logging.DEBUG):
        #    log.debug('%s', jsonpp(json_data))
        compaction_queue_size = None
        for bean in json_data['beans']:
            if bean['name'] == 'Hadoop:service=HBase,name=RegionServer,sub=Server':
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('found RegionServer section:')
                    log.debug('%s', jsonpp(bean))
                compaction_queue_size = bean['compactionQueueLength']
                if not isInt(compaction_queue_size):
                    qquit('UNKNOWN', 'non-integer returned for compactionQueueLength! ' + support_msg_api())
                return compaction_queue_size
    except KeyError as _:
        qquit('UNKNOWN', str(_) + ': failed to parse HBase Master jmx info. ' + support_msg_api())
    qquit('UNKNOWN', 'RegionServer mbean not found, double check this is pointing to an HBase RegionServer')

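# Illustrative shape of the /jmx payload parse() above expects; only the bean name and the
# compactionQueueLength key are relied upon, any other fields and the values shown are hypothetical:
#
#   {"beans": [{"name": "Hadoop:service=HBase,name=RegionServer,sub=Server",
#               "compactionQueueLength": 0}]}
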
def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    validate_host(host)
    validate_port(port)
    self.validate_thresholds()
    # observed bug in HDP 2.3 (HBase 1.1.2) where the JMX metric from HMaster UI /jmx is displaying 0 for
    # ritOldestAge, despite currently having regions stuck in transition for a large number of ms
    # [ {"name":"Hadoop:service=HBase,name=Master,sub=AssignmentManger", ..., "ritCountOverThreshold" : 0 }
    # https://issues.apache.org/jira/browse/HBASE-16636
    #url = 'http://%(host)s:%(port)s/jmx' % locals()
    # could get info from flat txt debug page but it doesn't contain the summary count
    #url = 'http://%(host)s:%(port)s/dump' % locals()
    url = 'http://%(host)s:%(port)s/master-status' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        qquit('CRITICAL', "%s %s" % (req.status_code, req.reason))
    longest_rit_time = self.parse(req.content)
    if longest_rit_time is None:
        self.msg = 'no regions in transition'
    elif not isInt(longest_rit_time):
        qquit('UNKNOWN', 'parse error - got non-integer \'{0}\' for '.format(longest_rit_time) +
              'longest regions in transition time when parsing HMaster UI')
    else:
        longest_rit_time /= 1000.0
        self.msg = 'HBase region longest current transition = {0:.2f} secs'.format(longest_rit_time)
        self.check_thresholds(longest_rit_time)
        self.msg += ' | longest_region_in_transition={0}'.format(longest_rit_time)
        self.msg += self.get_perf_thresholds()

def get(self):
    log.info('querying %s', self.software)
    url = '{protocol}://{host}:{port}/PolicyManagement/{api_version}/configurations/pdp/end_points'\
          .format(host=self.host, port=self.port, api_version=self.api_version, protocol=self.protocol)
    log.debug('GET %s', url)
    try:
        req = requests.get(url, auth=HTTPBasicAuth(self.user, self.password))
    except requests.exceptions.RequestException as _:
        errhint = ''
        if 'BadStatusLine' in str(_.message):
            errhint = ' (possibly connecting to an SSL secured port without using --ssl?)'
        elif self.protocol == 'https' and 'unknown protocol' in str(_.message):
            errhint = ' (possibly connecting to a plain HTTP port with the -S / --ssl switch enabled?)'
        qquit('CRITICAL', str(_) + errhint)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code == 404 and req.reason == 'Not Found':
        qquit('CRITICAL', '{0}: {1} (no end points?)'.format(req.status_code, req.reason))
    if req.status_code != 200:
        qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
    return req.content

def get_version(self):
    log.info('querying %s', self.software)
    url = 'http://{host}:{port}/home'.format(host=self.host, port=self.port)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '='*80, req.content.strip(), '='*80)
    if req.status_code != 200:
        qquit('CRITICAL', "{0} {1}".format(req.status_code, req.reason))
    soup = BeautifulSoup(req.content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '='*80))
    try:
        #version = soup.find('span', {'class': 'version'}).text
        version = soup.find('span', class_='version').text
    except (AttributeError, TypeError) as _:
        qquit('UNKNOWN', 'failed to find and parse {0} output. {1}\n{2}'\
              .format(self.software, support_msg_api(), traceback.format_exc()))
    return version

def get_version(self):
    log.info('querying %s', self.software)
    url = 'http://{host}:{port}/version'.format(host=self.host, port=self.port)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n%s\n%s\n%s", '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        qquit('CRITICAL', '{0}: {1}'.format(req.status_code, req.reason))
    try:
        json_dict = json.loads(req.content)
        version = json_dict['etcdserver']
        cluster_version = json_dict['etcdcluster']
    except KeyError as _:
        qquit('UNKNOWN', 'error parsing output from {software}: {error}. {support_msg}'\
              .format(software=self.software, error=_, support_msg=support_msg_api()))
    return (version, cluster_version)

def check_workflow(self, workflow_name, workflow_id, max_age=None, max_runtime=None):
    log.info("checking workflow '%s' id '%s'", workflow_name, workflow_id)
    (req, self.query_time) = self.req(url='{url_base}/workflow/publish/getWorkflowExecutionHistory'
                                      .format(url_base=self.url_base),
                                      # orders by newest first, but seems to return last 10 anyway
                                      body=json.dumps({'chunk_size': 1,
                                                       'currentPage': 1,
                                                       'wfName': workflow_name,
                                                       'wfId': workflow_id}))
    info = ''
    if workflow_name:
        info += " name '{0}'".format(workflow_name)
    if workflow_id:
        info += " id '{0}'".format(workflow_id)
    try:
        json_dict = json.loads(req.content)
        result = json_dict['result']
        not_found_err = '{0}. {1}'.format(info, self.extract_response_message(json_dict)) + \
                        'Perhaps you specified the wrong name/id? Use --list to see existing workflows'
        if result is None:
            qquit('CRITICAL', "no results found for workflow{0}".format(not_found_err))
        reports = result['jobExecutionReports']
        if not isList(reports):
            raise ValueError('jobExecutionReports is not a list')
        if not reports:
            qquit('CRITICAL', "no reports found for workflow{0}".format(not_found_err))
        # orders by newest first by default, checking last run only
        report = reports[0]
        status = report['status']
        if status == 'SUCCESS':
            self.ok()
        else:
            self.critical()
        self.msg += "workflow '{workflow}' id '{id}' status = '{status}'".format(workflow=report['wfName'],
                                                                                 id=report['wfId'],
                                                                                 status=status)
        self.check_times(report['startDate'], report['endDate'], max_age, max_runtime)
    except (KeyError, ValueError) as _:
        qquit('UNKNOWN', 'error parsing workflow execution history: {0}'.format(_))
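# For reference, a hypothetical response body consistent with the fields check_workflow()
# reads above (field names inferred from the parsing code, all values made up):
#
#     {
#         "result": {
#             "jobExecutionReports": [
#                 {
#                     "wfName": "my_workflow",
#                     "wfId": "123",
#                     "status": "SUCCESS",
#                     "startDate": "...",
#                     "endDate": "..."
#                 }
#             ]
#         }
#     }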
def get_version(self):
    url = 'http://{host}:{port}/solr/admin/info/system'.format(host=self.host, port=self.port)
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug('response: %s %s', req.status_code, req.reason)
    log.debug('content:\n%s\n%s\n%s', '=' * 80, req.content.strip(), '=' * 80)
    if req.status_code != 200:
        qquit('CRITICAL', '%s %s' % (req.status_code, req.reason))
    soup = BeautifulSoup(req.content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '=' * 80))
    try:
        version = soup.find('str', {'name': 'solr-spec-version'}).text
    except (AttributeError, TypeError) as _:
        qquit('UNKNOWN', 'failed to find / parse version in Solr output. {0}\n{1}'\
              .format(support_msg_api(), traceback.format_exc()))
    return version
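# For reference, the selector above targets a fragment like the following in the
# /solr/admin/info/system output (sample value made up):
#
#     <str name="solr-spec-version">4.10.3</str>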
def check_table_regions(self):
    log.info('checking regions for table \'%s\'', self.table)
    regions = None
    try:
        table = self.conn.table(self.table)
        regions = table.regions()
    except HBaseIOError as _:
        #if 'org.apache.hadoop.hbase.TableNotFoundException' in _.message:
        if 'TableNotFoundException' in _.message:
            qquit('CRITICAL', 'table \'{0}\' does not exist'.format(self.table))
        else:
            qquit('CRITICAL', _)
    except (socket.error, socket.timeout, ThriftException) as _:
        qquit('CRITICAL', _)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('%s', jsonpp(regions))
    if not regions:
        qquit('CRITICAL', 'failed to get regions for table \'{0}\''.format(self.table))
    if not isList(regions):
        qquit('UNKNOWN', 'region info returned is not a list! ' + support_msg_api())
    num_regions = len(regions)
    log.info('num regions: %s', num_regions)
    self.msg = 'HBase table \'{0}\' has {1} region{2}'.format(self.table, num_regions, plural(num_regions))
    self.check_thresholds(num_regions)
    num_unassigned_regions = 0
    for region in regions:
        try:
            if not region['server_name']:
                #log.debug('region \'%s\' is not assigned to any server', region['name'])
                num_unassigned_regions += 1
        except KeyError as _:
            qquit('UNKNOWN', 'failed to find server assigned to region. ' + support_msg_api())
    log.info('num unassigned regions: %s', num_unassigned_regions)
    self.msg += ', {0} unassigned region{1}'.format(num_unassigned_regions, plural(num_unassigned_regions))
    if num_unassigned_regions > 0:
        self.warning()
        self.msg += '!'
    self.msg += ' |'
    self.msg += ' num_regions={0}'.format(num_regions) + self.get_perf_thresholds(boundary='lower')
    self.msg += ' num_unassigned_regions={0};1;0'.format(num_unassigned_regions)
    log.info('finished, closing connection')
    self.conn.close()
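# For reference, each region entry iterated above is a dict assumed to carry at least the
# 'name' and 'server_name' keys (the only ones this check reads), e.g. something like:
#
#     {'name': 'mytable,,1448353564099.abcdef...', 'server_name': 'regionserver1', ...}
#
# An empty 'server_name' is treated as an unassigned region.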
def parse_is_table_compacting(content):
    soup = BeautifulSoup(content, 'html.parser')
    if log.isEnabledFor(logging.DEBUG):
        log.debug("BeautifulSoup prettified:\n{0}\n{1}".format(soup.prettify(), '=' * 80))
    try:
        headings = soup.findAll('h2')
        for heading in headings:
            log.debug("checking heading '%s'", heading)
            if heading.get_text() == 'Table Attributes':
                log.debug('found Table Attributes section header')
                table = heading.find_next('table')
                log.debug('checking first following table')
                if log.isEnabledFor(logging.DEBUG):
                    log.debug('table:\n%s\n%s', table.prettify(), '=' * 80)
                rows = table.findChildren('tr')
                if len(rows) < 3:
                    qquit('UNKNOWN', 'parse error - less than the 3 expected rows in table attributes')
                col_names = rows[0].findChildren('th')
                if len(col_names) < 3:
                    qquit('UNKNOWN', 'parse error - less than the 3 expected column headings')
                first_col = col_names[0].get_text().strip()
                if first_col != 'Attribute Name':
                    qquit('UNKNOWN',
                          'parse error - expected first column header to be \'{0}\' but got \'{1}\' instead. '\
                          .format('Attribute Name', first_col) + support_msg())
                for row in rows[1:]:
                    cols = row.findChildren('td')
                    if len(cols) < 3:
                        qquit('UNKNOWN', 'parse error - less than the 3 expected columns in table attributes. ' +
                              support_msg())
                    if cols[0].get_text().strip() == 'Compaction':
                        compaction_state = cols[1].get_text().strip()
                        # NONE when enabled, Unknown when disabled
                        if compaction_state in ('NONE', 'Unknown'):
                            return False
                        return True
        qquit('UNKNOWN', 'parse error - failed to find Table Attributes section in JSP. ' + support_msg())
    except (AttributeError, TypeError):
        qquit('UNKNOWN', 'failed to parse output. ' + support_msg())
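# A minimal usage sketch: the sample markup below only loosely mimics the HMaster table UI
# layout this parser targets and is not captured from a real cluster:
#
#     sample = '''
#     <h2>Table Attributes</h2>
#     <table>
#       <tr><th>Attribute Name</th><th>Value</th><th>Description</th></tr>
#       <tr><td>Enabled</td><td>true</td><td>Is the table enabled</td></tr>
#       <tr><td>Compaction</td><td>NONE</td><td>Is the table compacting</td></tr>
#     </table>
#     '''
#     parse_is_table_compacting(sample)    # -> False ('NONE' / 'Unknown' mean not compacting)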
def sanity_check(condition, msg):
    if not condition:
        qquit('UNKNOWN', 'HBase attribute table header ' + msg + ', failed sanity check! ' + support_msg())
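# Example usage (hypothetical condition and message, shown only to illustrate the call shape):
#
#     sanity_check(len(col_names) >= 3, 'has fewer than the 3 expected columns')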
def get_table_conn(self):
    log.info('checking table \'%s\'', self.table)
    if not self.conn.is_table_enabled(self.table):
        qquit('CRITICAL', "table '{0}' is not enabled!".format(self.table))
    table_conn = self.conn.table(self.table)
    return table_conn
def run(self):
    self.no_args()
    host = self.get_opt('host')
    port = self.get_opt('port')
    slave = self.get_opt('slave')
    list_slaves = self.get_opt('list_slaves')
    validate_host(host)
    validate_port(port)
    if not list_slaves:
        validate_host(slave, 'slave')
    url = 'http://%(host)s:%(port)s/master/slaves' % locals()
    log.debug('GET %s', url)
    try:
        req = requests.get(url)
    except requests.exceptions.RequestException as _:
        qquit('CRITICAL', _)
    log.debug("response: %s %s", req.status_code, req.reason)
    log.debug("content:\n{0}\n{1}\n{2}".format('='*80, req.content.strip(), '='*80))
    if req.status_code != 200:
        if req.status_code == 404:
            qquit('CRITICAL', '%s %s (did you point this at the correct Mesos Master?)'
                  % (req.status_code, req.reason))
        qquit('CRITICAL', "Non-200 response! %s %s" % (req.status_code, req.reason))
    content = req.content
    if not isJson(content):
        qquit('UNKNOWN', 'invalid JSON returned by Mesos Master')
    data = json.loads(content)
    if log.isEnabledFor(logging.DEBUG):
        log.debug('\n%s', jsonpp(data))
    slaves = {}
    regex = re.compile(r'^slave\(\d+\)\@(.+):\d+')
    try:
        for item in data['slaves']:
            match = regex.match(item['pid'])
            if match:
                slaves[item['hostname']] = match.group(1)
            else:
                slaves[item['hostname']] = item['pid']
    except KeyError:
        qquit('UNKNOWN', 'failed to parse slaves from Mesos API output. {0}'.format(support_msg_api()))
    if list_slaves:
        qquit('UNKNOWN', 'Slaves list:\n\n{0}'.format(dict_lines(slaves)))
    log.info('found slaves:\n\n{0}\n'.format(dict_lines(slaves)))
    slave = slave.lower()
    for _ in slaves:
        if slave == _.lower() or slave == slaves[_].lower():
            qquit('OK', "Mesos slave '{0}' registered with master".format(slave))
            break
    else:
        qquit('CRITICAL', "Mesos slave '{0}' not registered with master".format(slave))
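# For reference, the slave 'pid' field looks like 'slave(1)@192.168.99.100:5051' and the
# regex above extracts just the address portion. Standalone sketch:
#
#     import re
#     regex = re.compile(r'^slave\(\d+\)\@(.+):\d+')
#     regex.match('slave(1)@192.168.99.100:5051').group(1)    # -> '192.168.99.100'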