def __enter__(self):
    """Open a TCP connection to the configured host and return the socket.

    The socket uses a 1 second timeout. ``self.skt`` is only set once the
    connection is established, so a failed attempt leaves the object
    reusable.

    Returns:
        The connected socket (also stored in ``self.skt``).

    Raises:
        BDMonException: if this object already holds a socket.
        OSError: (including ``socket.gaierror``) if the host cannot be
            resolved or the connection cannot be made; propagated
            unchanged.
    """
    if self.skt is not None:
        raise BDMonException('BDM-SKT-01: Connection exists on %s:%s' %
                             (self._host, self._port))
    self._lgr.info('Connecting to ZK node :%s at port:%s', self._host,
                   self._port)
    skt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        skt.settimeout(1.0)
        hostip = socket.gethostbyname(self._host)
        self._lgr.info("HOST IP to connect: %s", hostip)
        skt.connect((hostip, self._port))
    except Exception:
        # __exit__ is not invoked when __enter__ raises, so the descriptor
        # must be released here or it leaks.
        skt.close()
        raise
    self.skt = skt
    return self.skt
def _get_metrics(self, app_host_port):
    """GET the JMX data of one application endpoint.

    The URI is ``<self._proto>://<app_host_port><self._uripath>``. The
    request uses a 1 second timeout, ``self._tlsverify`` for certificate
    verification and Kerberos authentication when ``self._kerb`` is 'y'.

    Args:
        app_host_port: ``host:port`` part of the URI to query.

    Returns:
        The decoded JSON body of the response.

    Raises:
        BDMonException: on a connection error or timeout, on a status
            other than 200, or when the body is not valid JSON.
            ``self.mtrx['error']`` is incremented in each of these cases.
    """
    app_uri = self._proto + "://" + app_host_port + self._uripath
    self._lgr.info('Invoking:%s', app_uri)
    self._lgr.info('Kerberos setting:%s', self._kerb)
    self._lgr.info('VerifyTLS setting:%s', self._tlsverify)
    req_args = {'timeout': 1.0, 'verify': self._tlsverify}
    if self._kerb == 'y':
        req_args['auth'] = HTTPKerberosAuth()
    try:
        res = requests.get(app_uri, **req_args)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout) as err:
        errmsg = 'BDM-URI-00: Connection error to metrics URI:%s' % app_uri
        self._lgr.error(errmsg)
        self._lgr.error(err)
        self.mtrx['error'] += 1
        raise BDMonException(err) from err
    if res.status_code != 200:
        errmsg = 'BDM-URI-05: Application metrics collection error %s' % res.status_code
        self._lgr.error(errmsg)
        self.mtrx['error'] += 1
        raise BDMonException(errmsg)
    try:
        dct = res.json()
    except ValueError as err:
        errmsg = 'BDM-URI-03: FAILED to get a valid json response'
        self._lgr.error(errmsg)
        self._lgr.error(err)
        self.mtrx['error'] += 1
        raise BDMonException(err) from err
    # Lazy logging arguments: the document is only rendered to text when
    # debug logging is actually enabled.
    self._lgr.debug('GET results:%s', dct)
    return dct
def __init__(self, logger, dbdetail=''):
    """Open the database connection and the cursor used for DB operations.

    Args:
        logger: logger used for every message of this object.
        dbdetail: ``;``-separated ``key=value`` connection string. The
            ``server``, ``port`` and ``driver`` entries are read here. A
            driver of ``{sqlite}`` opens ``server`` with sqlite3; any
            other driver passes the whole string to ``pyodbc.connect``.
            When empty, the string, the retry count and the retry delay
            come from ``getdbdetails``.

    Raises:
        BDMonException: when no connection could be established.
    """
    self._lgr = logger
    self.stmt = ''
    self.values = ''
    # An explicit connection string carries no retry settings: make a
    # single attempt without back-off instead of failing on unbound names.
    rcnt, stime = 1, 0
    if not dbdetail:
        dbdetail, rcnt, stime = getdbdetails(self._lgr)
    errmsg = ''
    # Defaults keep the logging below safe when an entry is absent
    # (e.g. a sqlite connection string without a port).
    dbsrvr = dbport = dbtype = ''
    for each in dbdetail.split(';'):
        name = each.strip().lower()
        val = each.partition('=')[2]
        if name.startswith('server'):
            dbsrvr = val
        elif name.startswith('port'):
            dbport = val
        elif name.startswith('driver'):
            dbtype = val
    is_sqlite = dbtype.lower() == '{sqlite}'
    # Always try at least once, even if the configured count is 0
    attempts = max(1, rcnt)
    for rtry in range(attempts):
        try:
            if is_sqlite:
                self._dbcn = sqlite3.connect(dbsrvr)
            else:
                self._dbcn = pyodbc.connect(dbdetail)
        except pyodbc.Error as err:
            self._lgr.error(str(err))
            errmsg = 'BDM-DB-00: Unable to create connection to %s DB.' % dbtype
            self._lgr.error(errmsg)
            self._lgr.error('DB Host:%s; DB Port:%s Retry:%s' %
                            (dbsrvr, dbport, rtry + 1))
            # No point in waiting after the final attempt
            if rtry + 1 != attempts:
                sleep(stime + rtry)
        except sqlite3.Error as err:
            # A local file that cannot be opened is not retried
            self._lgr.error(str(err))
            errmsg = 'BDM-DB-01: Unable to open sqlite DB file: %s' % dbsrvr
            self._lgr.error(errmsg)
            break
        else:
            # Connected: failures of earlier attempts no longer matter
            errmsg = ''
            if is_sqlite:
                self._dbcn.row_factory = sqlite3.Row
                self._lgr.debug("Created sqlite connection")
            else:
                self._dbcn.autocommit = False
                self._lgr.debug("Created %s connection", dbtype)
            self.crsr = self._dbcn.cursor()
            if "SQL Server" in dbtype:
                self.crsr.fast_executemany = True
            elif "PostgreSQL" in dbtype:
                self._dbcn.setdecoding(pyodbc.SQL_WCHAR, encoding='utf-8')
                self._dbcn.setencoding(encoding='utf-8')
            break
    if errmsg:
        raise BDMonException(errmsg)
def execstmt(self):
    """Execute ``self.stmt``; the cursor can then be iterated for results.

    * An INSERT with a non-empty ``self.values`` is run with
      ``executemany`` (one row per element).
    * An INSERT whose ``self.values`` is an empty list/tuple has no rows
      to write and is skipped.
    * Any other statement is run with ``execute``, passing
      ``self.values`` as bind parameters when it is non-empty.

    Raises:
        BDMonException: when the statement is empty or the driver reports
            an error.
    """
    self._lgr.debug("DB stmt: %s", self.stmt)
    self._lgr.debug("values: %s", self.values)
    words = self.stmt.split()
    if not words:
        errmsg = 'BDM-DB-05: Unable to execute query.'
        self._lgr.error(errmsg)
        self._lgr.error('Empty DB statement')
        raise BDMonException(errmsg)
    is_insert = words[0].lower() == 'insert'
    try:
        if self.values:
            if is_insert:
                self.crsr.executemany(self.stmt, self.values)
            else:
                self.crsr.execute(self.stmt, self.values)
        elif is_insert and isinstance(self.values, (list, tuple)):
            # executemany() needs a parameter sequence and cannot be called
            # without one; an empty row list simply means nothing to insert.
            self._lgr.debug("No rows to insert, statement skipped")
        else:
            self.crsr.execute(self.stmt)
    except (pyodbc.Error, sqlite3.Error) as err:
        errmsg = 'BDM-DB-05: Unable to execute query.'
        self._lgr.error(errmsg)
        self._lgr.error(self.stmt)
        self._lgr.error(str(err))
        raise BDMonException(err) from err
def _process_metrics(lgr, applst):
    """Function to process BD metrics data.

    Reads the active metric definitions of the given applications from
    ``t_coll_metrics``, calls ``get_metrics_<app>`` on a ``_BDMProcess``
    object for each application, and finally writes the run counters and
    timings to ``t_bdmon_metrics`` before committing and closing.

    Args:
        lgr: logger used for all messages.
        applst: names of the applications to process.

    Raises:
        BDMonException: when the DB connection cannot be created or a DB
            statement issued here fails.
    """
    apps = list(applst)
    # Get the list of applications, components and metrics to collect
    # Layout: {appname: {appcomponent: [(modelertype, mtypename), ...]}}
    dct = {app: {} for app in apps}
    lgr.info("List of applications initialized: %s", dct)
    try:
        dbo = DbOps(lgr)
    except BDMonException as err:
        errmsg = 'BDM-PM-00: Unable to create database connection'
        lgr.error(errmsg)
        lgr.error(err)
        raise BDMonException(err) from err
    lgr.info('DB Connection ready')
    if apps:
        # Bind the application names instead of splicing them into the SQL
        # text, so quoting in a name cannot alter the statement.
        dbo.stmt = ("select appname, appcomponent, modelertype, mtypename "
                    "from t_coll_metrics where appname in (" +
                    ",".join("?" for _ in apps) + ") and is_active='Y'")
        dbo.values = apps
        dbo.execstmt()
        for row in dbo.crsr.fetchall():
            #(appname, appcomponent, modelertype, mtypename)
            dct.setdefault(row[0], {}).setdefault(row[1], []).append(
                (row[2], row[3]))
        dbo.values = ''
    dbo.stmt = ''
    lgr.info("List of application metrics to collect confirmed")
    lgr.debug("List of application metrics: %s", dct)
    appstime = datetime.now()
    # Remains the end time when there is no application to process
    etime = appstime
    lgr.info('Start processing of apps at %s ', appstime)
    #Invoke the processing method for each of the app
    getmtrx = _BDMProcess(lgr, dbo, dct)
    for app in apps:
        stime = datetime.now()
        lgr.info('Start App processing: %s at %s', app, stime)
        try:
            fnc = 'get_metrics_' + app
            getattr(getmtrx, fnc)()
        except AttributeError as err:
            errmsg = 'BDM-APP-01: Invalid application name: %s ; Check config' % app
            lgr.error(errmsg)
            lgr.error(err)
            getmtrx.mtrx['error'] += 1
        except BDMonException as err:
            # Drop whatever this application wrote before it failed
            dbo.rollback()
            lgr.error('BDM-APP-03: Error while processing application: %s',
                      app)
        etime = datetime.now()
        getmtrx.mtrx[app + "CollectionTime"] = (etime - stime).total_seconds()
        lgr.info('End App processing: %s at %s; Total time:%s', app, etime,
                 etime - stime)
    #BDMonhost, metricname, numvalue, collection_ts
    getmtrx.mtrx["totalCollectionTime"] = (etime - appstime).total_seconds()
    lgr.info('Total processing:- Begin time: %s; End time:%s; Total time:%s',
             appstime, etime, getmtrx.mtrx["totalCollectionTime"])
    bdmonhost = os.uname()[1]
    dbo.values = [(bdmonhost, key, val, etime)
                  for key, val in getmtrx.mtrx.items()]
    dbo.stmt = ('insert into t_bdmon_metrics '
                '(bdmonhost, metricname, numvalue, collection_ts) '
                'values(?, ?, ?, ?)')
    dbo.execstmt()
    dbo.commitclose()
def get_metrics_zookeeper(self):
    """Function to process zookeeper quorum metrics.

    Sends ``stat`` to every node returned by ``getzkdetails`` and parses
    the text response line by line:

    * ``Latency min/avg/max: a/b/c`` becomes three ``*_latency`` rows,
    * client lines containing ``](queued=`` are stored per client with the
      ``zk_conn_mtrx`` statement,
    * any other ``key: value`` line with a numeric value is stored with
      the ``zk_mtrx`` statement.

    A node that fails is logged and counted, and the remaining nodes are
    still processed.

    Raises:
        BDMonException: after all nodes were tried, if at least one node
            could not be contacted or returned no usable response.
    """
    zkq = getzkdetails(self._lgr)
    #e.g. namenode:2181,snode:2181,datanode1:2181
    zknodes = zkq.replace(' ', '').split(',')
    errmsg = ''
    cmd = str.encode('stat')
    for node in zknodes:
        # partition() never raises: an entry without a port ends up with an
        # empty port, which int() rejects inside the try block below.
        self._host, _, port = node.partition(':')
        try:
            conn = _SocketConn(self._lgr, self._host, int(port))
            # Using a context manager to work with sockets
            with conn as zks:
                msgs = []
                snt = zks.send(cmd)
                #Sent message length should match
                if snt != len(cmd):
                    errmsg = 'BDM-SK-03: Cannot send stat to ZK node %s:%s' % (
                        self._host, port)
                    self._lgr.error(errmsg)
                    self.mtrx['error'] += 1
                    raise BDMonException(errmsg)
                # Read until recv() returns no more data
                while True:
                    resp = zks.recv(4096)
                    if not resp:
                        break
                    msg = resp.decode()
                    self._lgr.debug('ZK data: %s', msg)
                    msgs.append(msg)
        except (socket.gaierror, ValueError, socket.error,
                BDMonException) as err:
            errmsg = 'BDM-SK-00: Connection error to ZK node %s:%s' % (
                self._host, port)
            self._lgr.warning(errmsg)
            self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                              node)
            self._lgr.warning("Received ZK Server error:%s", err)
            self.mtrx['warning'] += 1
            continue

        if not msgs:
            #We received no response from ZK server within the timeout period
            errmsg = 'BDM-SK-05: No response from ZK node %s:%s' % (
                self._host, port)
            self._lgr.warning(errmsg)
            self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                              node)
            self.mtrx['warning'] += 1
            continue

        cltime = datetime.now()
        resp = ''.join(msgs)
        self._lgr.info('Response Length:%s', len(resp))
        mode_parts = resp.split('\nMode: ')
        if len(mode_parts) < 2 or not mode_parts[1]:
            # The reply carries no "Mode:" line, so it is not a usable stat
            # report; record the node as failed instead of crashing.
            errmsg = 'BDM-SK-07: No server mode in response from ZK node %s:%s' % (
                self._host, port)
            self._lgr.warning(errmsg)
            self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                              node)
            self.mtrx['warning'] += 1
            continue
        zk_mode = mode_parts[1][0]  #l-leader, f-follower, s-standalone
        zk_mtrx = []
        for stat in resp.split('\n'):
            self._lgr.debug('ZK status item:%s', stat)
            zkrow = stat.split(':')
            if stat.startswith('Latency'):
                # e.g. Latency min/avg/max: 0/0/16
                # Slice the prefix off: str.strip('Latency ') would remove
                # any of those characters from both ends of the names.
                ltypes = zkrow[0][len('Latency'):].strip().split('/')
                lvals = zkrow[1].strip().split('/')
                for ltype, lval in zip(ltypes, lvals):
                    #hostnode, zk_mode, metricname, numvalue, collection_ts
                    try:
                        zk_mtrx.append((self._host, zk_mode,
                                        ltype + '_latency', float(lval),
                                        cltime))
                    except ValueError:
                        self._lgr.warning('Ignore zk Latency key %s', ltype)
                        self.mtrx['warning'] += 1
            elif "](queued=" in stat:
                self._lgr.info('ZK client: %s', stat)
                #e.g. /x.x.x.x:xxxx[0](queued=0,recved=1,sent=0)
                clnt_host = stat.split(':')[0].replace('/', '')
                clnt_mtrx = []
                for clval in stat.split('](')[1].replace(')', '').split(','):
                    mname, nval = clval.split('=')
                    self._lgr.info('ZK client mname:%s, val=%s', mname, nval)
                    #zk_hostnode, client_hostnode, metricname, numvalue, collection_ts
                    try:
                        clnt_mtrx.append((self._host, clnt_host, mname,
                                          float(nval), cltime))
                    except ValueError:
                        self._lgr.warning('Ignore zk client %s', clval)
                        self.mtrx['warning'] += 1
                # Client rows are written once per client line
                self._dbo.values = clnt_mtrx
                self._dbo.stmt = self._dbo_stmts["zk_conn_mtrx"]
                self._bulk_insdb()
            elif len(zkrow) == 2 and zkrow[0] and zkrow[1]:
                self._lgr.info('ZK Key: %s ; Val:%s', zkrow[0], zkrow[1])
                #hostnode, zk_mode, metricname, numvalue, collection_ts
                try:
                    zk_mtrx.append((self._host, zk_mode, zkrow[0],
                                    float(zkrow[1]), cltime))
                except ValueError:
                    self._lgr.warning('Ignore zk key %s', zkrow[0])
                    self.mtrx['warning'] += 1
        self._dbo.values = zk_mtrx
        self._dbo.stmt = self._dbo_stmts["zk_mtrx"]
        self._bulk_insdb()
    if errmsg:
        # there were errors when capturing ZK metrics on one or more nodes
        raise BDMonException(errmsg)
def get_metrics_yarn(self):
    """Function to process YARN active/standby jmx data.

    The configured ResourceManagers are queried in order until one
    answers. Its beans supply the NodeManager list (the restricted
    ``nmnodes`` list from the configuration when set, otherwise the
    ``LiveNodeManagers`` bean) and the ResourceManager metrics to store.
    Only that first reachable ResourceManager is processed. The
    NodeManagers are then handed to ``_get_metrics_workers``.

    Raises:
        BDMonException: when no NodeManager list could be determined,
            i.e. no ResourceManager answered or the one that answered
            reported no live NodeManagers.
    """
    ## Get the resourcemanager nodes, uri protocol, uri_path
    yarn = getyarndetails(self._lgr)
    self._proto = yarn["proto"]
    self._uripath = yarn["uripath"]
    rmnodes = yarn["rm"].replace(' ', '').split(',')
    self._lgr.debug("Received yarn config info:%s", yarn)
    # Kept separate from rmnodes: if no NodeManager list is found, the
    # ResourceManager hosts must not be polled as if they were NodeManagers.
    nmnodes = None
    for node in rmnodes:
        try:
            jdata = self._get_metrics(node)
        except BDMonException as err:
            self._lgr.warning("Unable to get RM metrics, ignoring Node:%s",
                              node)
            self._lgr.warning("Received node error:%s", err)
            self.mtrx['warning'] += 1
            continue
        self._host = node.split(':')[0]
        cltime = datetime.now()
        for mtrx in jdata["beans"]:
            if mtrx.get("LiveNodeManagers", "{}") != "{}":
                self._lgr.info("Getting nodemanagers list")
                if yarn["nmnodes"]:
                    #restricted NM list
                    nmnodes = yarn["nmnodes"].replace(' ', '').split(',')
                    self._lgr.info("Restricted RMNodes: %s, Total:%s",
                                   nmnodes, len(nmnodes))
                else:
                    #Get the list of all rmnodes
                    lnodes = ujson.loads(mtrx["LiveNodeManagers"])
                    self._lgr.info("Total RMNodes to process: %s",
                                   len(lnodes))
                    nmnodes = [rmn["NodeHTTPAddress"] for rmn in lnodes]
                break
        for mtrx in jdata["beans"]:
            mtype = mtrx["modelerType"]
            self._lgr.debug("RMNode:%s ; Check metrics:%s", self._host,
                            mtype)
            #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
            if not any(mtype.startswith(mx[0])
                       for mx in self._appmtrx['yarn']['rm']):
                continue
            self._lgr.info("RMNode:%s ; Process metrics:%s", self._host,
                           mtype)
            if mtype == "sun.management.OperatingSystemImpl":
                self._ins_osdata(mtrx, 'yarn', 'rm', cltime)
            else:
                # Only numeric, non-NaN attributes are stored
                self._dbo.values = [(self._host, mtype, key, val, cltime)
                                    for key, val in mtrx.items()
                                    if hasattr(val, 'real') and not isnan(val)]
                self._dbo.stmt = self._dbo_stmts["yarn_rm"]
                self._bulk_insdb()
        #Yarn RMs can be in Active-StandBy mode,
        #An active node is processed, skip the standby node, because it will point to active
        break
    #RMnodes complete, now process NMnodes
    if nmnodes is None:
        errmsg = "BDM-YN-00: Looks like we don't have an active ResourceManager"
        self._lgr.error(errmsg)
        self.mtrx['error'] += 1
        raise BDMonException(errmsg)
    self._lgr.info("NMnodes list: %s", nmnodes)
    self._get_metrics_workers(nmnodes, yarn["nmport"], 'yarn', 'nm')
def get_metrics_hbase(self):
    """Function to process Hbase/Hmaster jmx data.

    Every configured HMaster is queried and its selected metrics stored,
    flagged with whether that master is the active one. The active master
    (bean ``Master,sub=Server`` with ``tag.isActiveMaster`` == "true")
    also supplies the region server list, unless a restricted
    ``regionservers`` list is configured. The region servers are then
    handed to ``_get_metrics_workers``.

    Raises:
        BDMonException: when no active HMaster was found, so the region
            server list is unknown.
    """
    hbase = gethbasedetails(self._lgr)
    self._proto = hbase["proto"]
    self._uripath = hbase["uripath"]
    hbnodes = hbase["hmaster"].replace(' ', '').split(',')
    self._lgr.debug("Received HBase config info:%s", hbase)
    # Stays None until an active HMaster has been seen
    rsrvrs = None
    for node in hbnodes:
        try:
            jdata = self._get_metrics(node)
        except BDMonException as err:
            self._lgr.warning(
                "Unable to get HMaster metrics, ignoring Node:%s", node)
            self._lgr.warning("Received Hmaster error:%s", err)
            self.mtrx['warning'] += 1
            continue
        self._host = node.split(':')[0]
        is_active = 'N'
        cltime = datetime.now()
        for mtrx in jdata["beans"]:
            if mtrx["modelerType"] == "Master,sub=Server" and \
                    mtrx["tag.isActiveMaster"] == "true":
                is_active = 'Y'
                self._lgr.info("Getting regionservers list")
                if hbase["regionservers"]:
                    #restricted regionservers
                    rsrvrs = hbase["regionservers"].replace(' ',
                                                            '').split(',')
                    self._lgr.info("Restricted RS: %s, Total:%s", rsrvrs,
                                   len(rsrvrs))
                else:
                    #Get the list of all regionservers
                    rsnodes = mtrx["tag.liveRegionServers"]
                    self._lgr.info("RSNodes to process: %s", rsnodes)
                    # Entries are ';'-separated; the text before the first
                    # ',' of each entry is kept
                    rsrvrs = [rs.split(',')[0] for rs in rsnodes.split(';')]
                break
        for mtrx in jdata["beans"]:
            mtype = mtrx["modelerType"]
            self._lgr.debug("HMaster:%s ; Check metrics:%s", self._host,
                            mtype)
            #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
            if not any(mtype.startswith(mx[0])
                       for mx in self._appmtrx['hbase']['hmaster']):
                continue
            self._lgr.info("HMaster:%s ; Process metrics:%s", self._host,
                           mtype)
            if mtype == "sun.management.OperatingSystemImpl":
                self._ins_osdata(mtrx, 'hbase', 'hmaster', cltime)
            else:
                # Only numeric, non-NaN attributes are stored
                self._dbo.values = [(self._host, is_active, mtype, key, val,
                                     cltime)
                                    for key, val in mtrx.items()
                                    if hasattr(val, 'real') and not isnan(val)]
                self._dbo.stmt = self._dbo_stmts["hbase_hmaster"]
                self._bulk_insdb()
    #Hmasters complete, now process regionservers
    if rsrvrs is None:
        # Explicit check instead of catching UnboundLocalError, which would
        # also relabel unrelated bugs raised inside _get_metrics_workers.
        errmsg = "BDM-HB-00: Looks like we don't have an active HMaster, unable to get regionservers"
        self._lgr.error(errmsg)
        self.mtrx['error'] += 1
        raise BDMonException(errmsg)
    self._lgr.info("RegionServer list: %s", rsrvrs)
    self._get_metrics_workers(rsrvrs, hbase["rsport"], 'hbase',
                              'regionserver')
def get_metrics_hdfs(self):
    """Function to process HDFS namenode/snode jmx data.

    Every configured namenode is queried and its selected metrics stored,
    flagged with whether a bean reports ``tag.HAState`` == 'active'. A
    bean with a non-empty ``LiveNodes`` value supplies the datanode list,
    unless a restricted ``datanodes`` list is configured. The datanodes
    are then handed to ``_get_metrics_workers``.

    Raises:
        BDMonException: when no namenode provided a datanode list.
    """
    ## Get the namenodes, uri protocol, uri_path
    hdfs = gethdfsdetails(self._lgr)
    self._proto = hdfs["proto"]
    self._uripath = hdfs["uripath"]
    nnodes = hdfs["namenode"].replace(' ', '').split(',')
    self._lgr.debug("Received HDFS config info:%s", hdfs)
    # Stays None until a namenode has reported its live datanodes
    dnodes = None
    for node in nnodes:
        try:
            jdata = self._get_metrics(node)
        except BDMonException as err:
            self._lgr.warning(
                "Unable to get namenode metrics, ignoring Node:%s", node)
            self._lgr.warning("Received namenode error:%s", err)
            self.mtrx['warning'] += 1
            continue
        self._host = node.split(':')[0]
        is_active = 'N'
        cltime = datetime.now()
        for mtrx in jdata["beans"]:
            if mtrx.get("tag.HAState", 'N') == 'active':
                is_active = 'Y'
            elif mtrx.get("LiveNodes", "{}") != "{}":
                self._lgr.info("Getting datanodes list")
                if hdfs["datanodes"]:
                    #restricted datanode list
                    dnodes = hdfs["datanodes"].replace(' ', '').split(',')
                    self._lgr.info("Restricted dataNodes: %s, Total:%s",
                                   dnodes, len(dnodes))
                else:
                    #Get the list of all datanodes
                    lnodes = ujson.loads(mtrx["LiveNodes"])
                    self._lgr.info("DataNodes to process: %s",
                                   lnodes.keys())
                    dnodes = [info["infoAddr"] for info in lnodes.values()]
        for mtrx in jdata["beans"]:
            mtype = mtrx["modelerType"]
            self._lgr.debug("NameNode:%s ; Check metrics:%s", self._host,
                            mtype)
            #Metrics can be a substring from the start, hence the startswith
            #comes in handy with a lot metrics of the similar modelerType
            #e.g. instead of "sun.management.OperatingSystemImpl", it can be "sun.management.O"
            #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
            if not any(mtype.startswith(mx[0])
                       for mx in self._appmtrx['hdfs']['namenode']):
                continue
            self._lgr.info("NameNode:%s ; Process metrics:%s", self._host,
                           mtype)
            if mtype == "sun.management.OperatingSystemImpl":
                self._ins_osdata(mtrx, 'hdfs', 'namenode', cltime)
            else:
                # Only numeric, non-NaN attributes are stored
                self._dbo.values = [(self._host, is_active, mtype, key, val,
                                     cltime)
                                    for key, val in mtrx.items()
                                    if hasattr(val, 'real') and not isnan(val)]
                self._dbo.stmt = self._dbo_stmts["hdfs_namenode"]
                self._bulk_insdb()
    #Namenodes complete, now process datanodes
    if dnodes is None:
        # Explicit check instead of catching UnboundLocalError, which would
        # also relabel unrelated bugs raised inside _get_metrics_workers.
        errmsg = "BDM-HD-00: Looks like we don't have an active Namenode"
        self._lgr.error(errmsg)
        self.mtrx['error'] += 1
        raise BDMonException(errmsg)
    self._lgr.info("Datanodes list: %s", dnodes)
    self._get_metrics_workers(dnodes, hdfs["dnport"], 'hdfs', 'datanode')