Exemple #1
0
 def __enter__(self):
     """Open a TCP connection to the configured host/port and return the socket.

     The socket is created with a 1 second timeout, the host name is
     resolved and the connection is established.

     Returns:
         The connected socket (also stored in ``self.skt``).

     Raises:
         BDMonException: if ``self.skt`` already holds a socket.
         socket.gaierror, socket.error: propagated unchanged when the host
             cannot be resolved or the connection cannot be established.
     """
     if self.skt is not None:
         raise BDMonException('BDM-SKT-01: Connection exists on %s:%s' %
                              (self._host, self._port))
     self._lgr.info('Connecting to ZK node :%s at port:%s', self._host,
                    self._port)
     skt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     try:
         skt.settimeout(1.0)
         hostip = socket.gethostbyname(self._host)
         self._lgr.info("HOST IP to connect: %s", hostip)
         skt.connect((hostip, self._port))
     except Exception:
         # __exit__ is not invoked when __enter__ raises, so release the
         # descriptor here. self.skt is left as None so that the object can
         # be entered again instead of failing with "Connection exists".
         skt.close()
         raise
     # Publish the socket only once it is actually connected.
     self.skt = skt
     return self.skt
Exemple #2
0
 def _get_metrics(self, app_host_port):
     """GET the JMX data published at an application URI.

     The URI is built as ``<self._proto>://<app_host_port><self._uripath>``.
     Kerberos authentication is attached when ``self._kerb`` is ``'y'`` and
     TLS verification follows ``self._tlsverify``.

     Args:
         app_host_port: host (and port) part of the URI, e.g. ``host:port``.

     Returns:
         The decoded JSON body of the response.

     Raises:
         BDMonException: on connection error or timeout (BDM-URI-00), on a
             body that is not valid JSON (BDM-URI-03) or on any HTTP status
             other than 200 (BDM-URI-05). ``self.mtrx['error']`` is
             incremented in each of these cases.
     """
     app_uri = self._proto + "://" + app_host_port + self._uripath
     self._lgr.info('Invoking:%s', app_uri)
     self._lgr.info('Kerberos setting:%s', self._kerb)
     self._lgr.info('VerifyTLS setting:%s', self._tlsverify)
     # Both request variants share everything except the auth handler.
     req_args = {'timeout': 1.0, 'verify': self._tlsverify}
     if self._kerb == 'y':
         req_args['auth'] = HTTPKerberosAuth()
     try:
         res = requests.get(app_uri, **req_args)
     except (requests.exceptions.ConnectionError,
             requests.exceptions.Timeout) as err:
         errmsg = 'BDM-URI-00: Connection error to metrics URI:%s' % app_uri
         self._lgr.error(errmsg)
         self._lgr.error(err)
         self.mtrx['error'] += 1
         raise BDMonException(err)
     if res.status_code != 200:
         errmsg = 'BDM-URI-05: Application metrics collection error %s' % res.status_code
         self._lgr.error(errmsg)
         self.mtrx['error'] += 1
         raise BDMonException(errmsg)
     try:
         dct = res.json()
     except ValueError as err:
         errmsg = 'BDM-URI-03: FAILED to get a valid json response'
         self._lgr.error(errmsg)
         self._lgr.error(err)
         self.mtrx['error'] += 1
         raise BDMonException(err)
     # Lazy %-args: the payload is only rendered when debug logging is enabled.
     self._lgr.debug('GET results:%s', dct)
     return dct
Exemple #3
0
    def __init__(self, logger, dbdetail=''):
        """Open the database connection and create the working cursor.

        Args:
            logger: logger used for all diagnostics of this object.
            dbdetail: optional ';'-separated connection string whose
                ``server=``, ``port=`` and ``driver=`` items are inspected.
                When empty, the connection string, the retry count and the
                retry delay are obtained from ``getdbdetails``. When given,
                a single connection attempt is made.

        Raises:
            BDMonException: BDM-DB-00 when every pyodbc connection attempt
                failed, BDM-DB-01 when the sqlite database cannot be opened.
        """
        self._lgr = logger
        self.stmt = ''
        self.values = ''
        if dbdetail:
            # No retry settings accompany a caller-supplied connection
            # string: make one attempt without back-off (these names were
            # previously unbound on this path).
            rcnt, stime = 1, 0
        else:
            dbdetail, rcnt, stime = getdbdetails(self._lgr)
        errmsg = ''
        # Defaults keep the names bound when an item is absent from dbdetail.
        dbsrvr = dbport = dbtype = ''

        for each in dbdetail.split(';'):
            if each.startswith('server'):
                dbsrvr = each.partition('=')[2]
            elif each.startswith('port'):
                dbport = each.partition('=')[2]
            elif each.startswith('driver'):
                dbtype = each.partition('=')[2]
        is_sqlite = dbtype.lower() == '{sqlite}'
        for rtry in range(rcnt):
            try:
                if is_sqlite:
                    self._dbcn = sqlite3.connect(dbsrvr)
                else:
                    self._dbcn = pyodbc.connect(dbdetail)
            except (pyodbc.OperationalError, pyodbc.Error) as err:
                self._lgr.error(str(err))
                errmsg = 'BDM-DB-00: Unable to create connection to %s DB.' % dbtype
                self._lgr.error(errmsg)
                self._lgr.error('DB Host:%s; DB Port:%s Retry:%s' %
                                (dbsrvr, dbport, rtry + 1))
                # Back off (a little longer each time) only when another
                # attempt will follow.
                if rtry + 1 < rcnt:
                    sleep(stime + rtry)
            except (sqlite3.OperationalError, sqlite3.Error) as err:
                self._lgr.error(str(err))
                errmsg = 'BDM-DB-01: Unable to open sqlite DB file: %s' % dbsrvr
                self._lgr.error(errmsg)
                # A local file will not become available by retrying.
                break
            else:
                # Connected: forget the error of any earlier failed attempt,
                # otherwise a successful retry would still raise below.
                errmsg = ''
                if not is_sqlite:
                    self._dbcn.autocommit = False
                    self._lgr.debug("Created %s connection", dbtype)
                else:
                    self._dbcn.row_factory = sqlite3.Row
                    self._lgr.debug("Created sqlite connection")
                self.crsr = self._dbcn.cursor()
                if "SQL Server" in dbtype:
                    self.crsr.fast_executemany = True
                elif "PostgreSQL" in dbtype:
                    self._dbcn.setdecoding(pyodbc.SQL_WCHAR, encoding='utf-8')
                    self._dbcn.setencoding(encoding='utf-8')
                break
        if errmsg:
            raise BDMonException(errmsg)
Exemple #4
0
 def execstmt(self):
     """Execute ``self.stmt`` on ``self.crsr``.

     The cursor object can be iterated for the resultset afterwards.

     * With ``self.values`` set, an ``insert`` statement is run through
       ``executemany`` (``self.values`` is a sequence of rows) and any other
       statement through ``execute`` (``self.values`` is one parameter set).
     * Without ``self.values`` the statement is run as-is through
       ``execute``.

     Raises:
         BDMonException: when the database driver reports an error.
     """
     try:
         self._lgr.debug("DB stmt: %s", self.stmt)
         self._lgr.debug("values: %s", self.values)
         # An empty/blank statement has no first word; treat it as "not an
         # insert" and let the driver decide instead of raising IndexError.
         words = self.stmt.split(None, 1)
         is_insert = bool(words) and words[0].lower() == 'insert'
         if not self.values:
             # executemany() requires a parameter sequence; calling it with
             # the statement alone raises TypeError, so a parameterless
             # statement (insert or not) goes through execute().
             self.crsr.execute(self.stmt)
         elif is_insert:
             self.crsr.executemany(self.stmt, self.values)
         else:
             self.crsr.execute(self.stmt, self.values)
     except (pyodbc.Error, sqlite3.Error) as err:
         errmsg = 'BDM-DB-05: Unable to execute query.'
         self._lgr.error(errmsg)
         self._lgr.error(self.stmt)
         self._lgr.error(str(err))
         raise BDMonException(err)
Exemple #5
0
def _process_metrics(lgr, applst):
    """Collect the metrics of every application in ``applst``.

    Steps:
      1. open a database connection (``DbOps``),
      2. read the active metric definitions of the requested applications
         from ``t_coll_metrics``,
      3. call ``get_metrics_<app>`` on a ``_BDMProcess`` object for each
         application, timing each call,
      4. store the collector's own counters and timings in
         ``t_bdmon_metrics``, then commit and close.

    Args:
        lgr: logger used for all diagnostics.
        applst: sequence of application names to process.

    Raises:
        BDMonException: when the database connection cannot be created, or
            when one of the statements issued here fails.
    """
    # Get the list of applications, components and metrics to collect
    dct = {app: {} for app in applst}
    lgr.info("List of applications initialized: %s", dct)
    try:
        dbo = DbOps(lgr)
        lgr.info('DB Connection ready')
    except BDMonException as err:
        errmsg = 'BDM-PM-00: Unable to create database connection'
        lgr.error(errmsg)
        lgr.error(err)
        raise BDMonException(err)
    # Bind the application names as parameters instead of quoting them into
    # the statement text, so a name containing a quote cannot alter the SQL.
    placeholders = ",".join("?" for _ in applst)
    dbo.stmt = ("select appname, appcomponent, modelertype, mtypename "
                "from t_coll_metrics where appname in (" + placeholders +
                ") and is_active='Y'")
    dbo.values = tuple(applst)
    dbo.execstmt()
    for row in dbo.crsr.fetchall():
        #(appname, appcomponent, modelertype, mtypename)
        dct[row[0]].setdefault(row[1], []).append((row[2], row[3]))

    dbo.stmt = ''
    dbo.values = ''
    lgr.info("List of application metrics to collect confirmed")
    lgr.debug("List of application metrics: %s", dct)
    appstime = datetime.now()
    # Keeps etime bound for the totals below when applst is empty.
    etime = appstime
    lgr.info('Start processing of apps at %s ', appstime)
    #Invoke the processing method for each of the app
    getmtrx = _BDMProcess(lgr, dbo, dct)
    for app in applst:
        stime = datetime.now()
        lgr.info('Start App processing: %s at %s', app, stime)
        try:
            fnc = 'get_metrics_' + app
            getattr(getmtrx, fnc)()
        except AttributeError as err:
            errmsg = 'BDM-APP-01: Invalid application name: %s ; Check config' % app
            lgr.error(errmsg)
            lgr.error(err)
            getmtrx.mtrx['error'] += 1
        except BDMonException as err:
            # Discard the partial work of the failed application and carry
            # on with the next one.
            dbo.rollback()
            lgr.error('BDM-APP-03: Error while processing application: %s',
                      app)
            lgr.error(err)
        etime = datetime.now()
        getmtrx.mtrx[app + "CollectionTime"] = (etime - stime).total_seconds()
        lgr.info('End App processing: %s at %s; Total time:%s', app, etime,
                 etime - stime)
    #BDMonhost, metricname, numvalue, collection_ts
    getmtrx.mtrx["totalCollectionTime"] = (etime - appstime).total_seconds()
    lgr.info('Total processing:- Begin time: %s; End time:%s; Total time:%s',
             appstime, etime, getmtrx.mtrx["totalCollectionTime"])
    bdmonhost = os.uname()[1]
    dbo.values = [(bdmonhost, key, val, etime)
                  for key, val in getmtrx.mtrx.items()]
    dbo.stmt = ('insert into t_bdmon_metrics '
                '(bdmonhost, metricname, numvalue, collection_ts) '
                'values(?, ?, ?, ?)')
    dbo.execstmt()
    dbo.commitclose()
Exemple #6
0
 def get_metrics_zookeeper(self):
     """Collect the 'stat' output of every zookeeper quorum node.

     For each ``host:port`` returned by ``getzkdetails`` the command
     ``stat`` is sent over a socket and the reply is parsed line by line:

     * ``Latency min/avg/max: a/b/c`` becomes three ``<name>_latency`` rows,
     * client lines such as ``/x.x.x.x:xxxx[0](queued=0,recved=1,sent=0)``
       are stored per client with the ``zk_conn_mtrx`` statement,
     * any other ``key: value`` line with a numeric value is stored with
       the ``zk_mtrx`` statement.

     Nodes that cannot be reached, do not answer, or answer without a
     ``Mode:`` line are skipped with a warning and counted in
     ``self.mtrx['warning']``.

     Raises:
         BDMonException: after all nodes were tried, if at least one node
             could not be processed.
     """
     zkq = getzkdetails(self._lgr)
     #e.g. namenode:2181,snode:2181,datanode1:2181
     zknodes = zkq.replace(' ', '').split(',')
     errmsg = ''
     for node in zknodes:
         self._host, port = node.split(':')
         try:
             conn = _SocketConn(self._lgr, self._host, int(port))
             # Using a context manager to work with sockets
             with conn as zks:
                 msgs = []
                 snt = zks.send(str.encode('stat'))
                 #Sent message length should match
                 if snt != 4:
                     errmsg = 'BDM-SK-03: Cannot send stat to ZK node %s:%s' % (
                         self._host, port)
                     self._lgr.error(errmsg)
                     self.mtrx['error'] += 1
                     raise BDMonException(errmsg)
                 # Read until recv() returns an empty chunk.
                 while True:
                     resp = zks.recv(4096)
                     if not resp:
                         break
                     msg = resp.decode()
                     self._lgr.debug('ZK data: %s', msg)
                     msgs.append(msg)
         except (socket.gaierror, ValueError, socket.error,
                 BDMonException) as err:
             errmsg = 'BDM-SK-00: Connection error to ZK node %s:%s' % (
                 self._host, port)
             self._lgr.warning(errmsg)
             self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                               node)
             self._lgr.warning("Received ZK Server error:%s", err)
             self.mtrx['warning'] += 1
             continue
         if not msgs:
             #We received no response from ZK server within the timeout period
             errmsg = 'BDM-SK-05: No response from ZK node %s:%s' % (
                 self._host, port)
             self._lgr.warning(errmsg)
             self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                               node)
             self.mtrx['warning'] += 1
             continue
         cltime = datetime.now()
         resp = ''.join(msgs)
         self._lgr.info('Response Length:%s', len(resp))
         mode_parts = resp.split('\nMode: ')
         if len(mode_parts) < 2 or not mode_parts[1]:
             # A reply without a "Mode:" line cannot be attributed to a
             # server role; skip the node instead of failing with IndexError.
             errmsg = 'BDM-SK-07: No Mode in response from ZK node %s:%s' % (
                 self._host, port)
             self._lgr.warning(errmsg)
             self._lgr.warning("Unable to get zookeeper metrics, Node:%s",
                               node)
             self.mtrx['warning'] += 1
             continue
         zk_mode = mode_parts[1][0]  #l-leader, f-follower, s-standalone
         zk_mtrx = []
         for stat in resp.split('\n'):
             self._lgr.debug('ZK status item:%s', stat)
             zkrow = stat.split(':')
             if stat.startswith('Latency') and len(zkrow) > 1:
                 # e.g. Latency min/avg/max: 0/0/16
                 # Slice the prefix off: str.strip() takes a character set
                 # and would also eat matching letters of the metric names.
                 lnames = zkrow[0][len('Latency '):].split('/')
                 lvals = zkrow[1].strip().split('/')
                 for ltype, lval in zip(lnames, lvals):
                     #hostnode, zk_mode, metricname, numvalue, collection_ts
                     try:
                         zk_mtrx.append((self._host, zk_mode,
                                         ltype + '_latency', float(lval),
                                         cltime))
                     except ValueError:
                         self._lgr.warning('Ignore zk Latency key %s',
                                           ltype)
                         self.mtrx['warning'] += 1
             elif "](queued=" in stat:
                 self._lgr.info('ZK client: %s', stat)
                 #e.g. /x.x.x.x:xxxx[0](queued=0,recved=1,sent=0)
                 clnt_host = stat.split(':')[0].replace('/', '')
                 clnt_mtrx = []
                 for clval in stat.split('](')[1].replace(')',
                                                          '').split(','):
                     #zk_hostnode, client_hostnode, metricname, numvalue, collection_ts
                     try:
                         # The unpacking is inside the try so that an item
                         # that is not exactly "name=value" is skipped too.
                         mname, nval = clval.split('=')
                         self._lgr.info('ZK client mname:%s, val=%s', mname,
                                        nval)
                         clnt_mtrx.append((self._host, clnt_host, mname,
                                           float(nval), cltime))
                     except ValueError:
                         self._lgr.warning('Ignore zk client %s', clval)
                         self.mtrx['warning'] += 1
                 self._dbo.values = clnt_mtrx
                 self._dbo.stmt = self._dbo_stmts["zk_conn_mtrx"]
                 self._bulk_insdb()
             elif len(zkrow) == 2 and zkrow[0] and zkrow[1]:
                 self._lgr.info('ZK Key: %s ; Val:%s', zkrow[0], zkrow[1])
                 #hostnode, zk_mode, metricname, numvalue, collection_ts
                 try:
                     zk_mtrx.append((self._host, zk_mode, zkrow[0],
                                     float(zkrow[1]), cltime))
                 except ValueError:
                     self._lgr.warning('Ignore zk key %s', zkrow[0])
                     self.mtrx['warning'] += 1
         self._dbo.values = zk_mtrx
         self._dbo.stmt = self._dbo_stmts["zk_mtrx"]
         self._bulk_insdb()
     if errmsg:  # there were errors when capturing ZK metrics on one or more nodes
         raise BDMonException(errmsg)
Exemple #7
0
 def get_metrics_yarn(self):
     """Collect YARN ResourceManager JMX data, then NodeManager data.

     The ResourceManager nodes from ``getyarndetails`` are tried in order;
     the first one that answers is processed and the remaining ones are
     skipped. The NodeManager list is either the restricted list from the
     configuration (``nmnodes``) or the ``NodeHTTPAddress`` of every entry
     of the ``LiveNodeManagers`` bean attribute.

     Raises:
         BDMonException: BDM-YN-00 when no NodeManager list could be
             obtained from any ResourceManager.
     """
     ## Get the resourcemanager nodes, uri protocol, uri_path
     yarn = getyarndetails(self._lgr)
     self._proto = yarn["proto"]
     self._uripath = yarn["uripath"]
     rmnodes = yarn["rm"].replace(' ', '').split(',')
     self._lgr.debug("Received yarn config info:%s", yarn)
     # Kept separate from rmnodes: reusing that name left it always bound,
     # so the "no active ResourceManager" check could never trigger and the
     # RM hosts themselves were polled as NodeManagers.
     nmnodes = None
     for node in rmnodes:
         try:
             jdata = self._get_metrics(node)
             self._host = node.split(':')[0]
         except BDMonException as err:
             self._lgr.warning("Unable to get RM metrics, ignoring Node:%s",
                               node)
             self._lgr.warning("Received node error:%s", err)
             self.mtrx['warning'] += 1
             continue
         cltime = datetime.now()
         for mtrx in jdata["beans"]:
             if mtrx.get("LiveNodeManagers", "{}") != "{}":
                 self._lgr.info("Getting nodemanagers list")
                 if yarn["nmnodes"]:  #restricted NM list
                     nmnodes = yarn["nmnodes"].replace(' ', '').split(',')
                     self._lgr.info("Restricted RMNodes: %s, Total:%s",
                                    nmnodes, len(nmnodes))
                 else:  #Get the list of all nodemanagers
                     lnodes = ujson.loads(mtrx["LiveNodeManagers"])
                     self._lgr.info("Total RMNodes to process: %s",
                                    len(lnodes))
                     nmnodes = [rmn["NodeHTTPAddress"] for rmn in lnodes]
                 break
         for mtrx in jdata["beans"]:
             mtype = mtrx["modelerType"]
             self._lgr.debug("RMNode:%s ; Check metrics:%s", self._host,
                             mtype)
             #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
             if any(
                     mtype.startswith(mx[0])
                     for mx in self._appmtrx['yarn']['rm']):
                 self._lgr.info("RMNode:%s ; Process metrics:%s",
                                self._host, mtype)
                 if mtype == "sun.management.OperatingSystemImpl":
                     self._ins_osdata(mtrx, 'yarn', 'rm', cltime)
                 else:
                     # Keep only numeric values (anything exposing .real)
                     # that are not NaN.
                     self._dbo.values = [
                         (self._host, mtype, key, val, cltime)
                         for key, val in mtrx.items()
                         if hasattr(val, 'real') and not isnan(val)
                     ]
                     self._dbo.stmt = self._dbo_stmts["yarn_rm"]
                     self._bulk_insdb()
         #Yarn RMs can be in Active-StandBy mode,
         #An active node is processed, skip the standby node, because it will point to active
         break
     #RMnodes complete, now process NMnodes
     if nmnodes is None:
         errmsg = "BDM-YN-00: Looks like we don't have an active ResourceManager"
         self._lgr.error(errmsg)
         self.mtrx['error'] += 1
         raise BDMonException(errmsg)
     self._lgr.info("NMnodes list: %s", nmnodes)
     self._get_metrics_workers(nmnodes, yarn["nmport"], 'yarn', 'nm')
Exemple #8
0
 def get_metrics_hbase(self):
     """Collect HBase HMaster JMX data, then RegionServer data.

     Every HMaster node from ``gethbasedetails`` is queried. The bean
     ``Master,sub=Server`` with ``tag.isActiveMaster == "true"`` marks the
     active master and provides the RegionServer list, unless a restricted
     list (``regionservers``) is configured.

     Raises:
         BDMonException: BDM-HB-00 when no active HMaster was found, so the
             RegionServer list is unknown.
     """
     hbase = gethbasedetails(self._lgr)
     self._proto = hbase["proto"]
     self._uripath = hbase["uripath"]
     hbnodes = hbase["hmaster"].replace(' ', '').split(',')
     self._lgr.debug("Received HBase config info:%s", hbase)
     # Explicit sentinel instead of relying on UnboundLocalError, which
     # would also mislabel unrelated errors raised while polling workers.
     rsrvrs = None
     for node in hbnodes:
         try:
             jdata = self._get_metrics(node)
             self._host = node.split(':')[0]
         except BDMonException as err:
             self._lgr.warning(
                 "Unable to get HMaster metrics, ignoring Node:%s", node)
             self._lgr.warning("Received Hmaster error:%s", err)
             self.mtrx['warning'] += 1
             continue
         is_active = 'N'
         cltime = datetime.now()
         for mtrx in jdata["beans"]:
             if mtrx["modelerType"] == "Master,sub=Server" and \
                mtrx["tag.isActiveMaster"] == "true":
                 is_active = 'Y'
                 self._lgr.info("Getting regionservers list")
                 if hbase["regionservers"]:  #restricted regionservers
                     rsrvrs = hbase["regionservers"].replace(' ',
                                                             '').split(',')
                     self._lgr.info("Restricted RS: %s, Total:%s", rsrvrs,
                                    len(rsrvrs))
                 else:  #Get the list of all regionservers
                     rsnodes = mtrx["tag.liveRegionServers"]
                     self._lgr.info("RSNodes to process: %s", rsnodes)
                     # Entries are ';'-separated; the part before the first
                     # ',' of each entry is kept.
                     rsrvrs = [
                         rs.split(',')[0] for rs in rsnodes.split(';')
                     ]
                 break
         for mtrx in jdata["beans"]:
             mtype = mtrx["modelerType"]
             self._lgr.debug("HMaster:%s ; Check metrics:%s", self._host,
                             mtype)
             #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
             if any(
                     mtype.startswith(mx[0])
                     for mx in self._appmtrx['hbase']['hmaster']):
                 self._lgr.info("HMaster:%s ; Process metrics:%s",
                                self._host, mtype)
                 if mtype == "sun.management.OperatingSystemImpl":
                     self._ins_osdata(mtrx, 'hbase', 'hmaster', cltime)
                 else:
                     # Keep only numeric values (anything exposing .real)
                     # that are not NaN.
                     self._dbo.values = [
                         (self._host, is_active, mtype, key, val, cltime)
                         for key, val in mtrx.items()
                         if hasattr(val, 'real') and not isnan(val)
                     ]
                     self._dbo.stmt = self._dbo_stmts["hbase_hmaster"]
                     self._bulk_insdb()
     #Hmasters complete, now process regionservers
     if rsrvrs is None:
         errmsg = "BDM-HB-00: Looks like we don't have an active HMaster, unable to get regionservers"
         self._lgr.error(errmsg)
         self.mtrx['error'] += 1
         raise BDMonException(errmsg)
     self._lgr.info("RegionServer list: %s", rsrvrs)
     self._get_metrics_workers(rsrvrs, hbase["rsport"], 'hbase',
                               'regionserver')
Exemple #9
0
 def get_metrics_hdfs(self):
     """Collect HDFS NameNode JMX data, then DataNode data.

     Every NameNode from ``gethdfsdetails`` is queried. A bean whose
     ``tag.HAState`` is ``active`` marks the node as active; a bean with a
     non-empty ``LiveNodes`` attribute provides the DataNode list
     (``infoAddr`` of each live node), unless a restricted list
     (``datanodes``) is configured.

     Raises:
         BDMonException: BDM-HD-00 when no DataNode list could be obtained
             from any NameNode.
     """
     ## Get the namenodes, uri protocol, uri_path
     hdfs = gethdfsdetails(self._lgr)
     self._proto = hdfs["proto"]
     self._uripath = hdfs["uripath"]
     nnodes = hdfs["namenode"].replace(' ', '').split(',')
     self._lgr.debug("Received HDFS config info:%s", hdfs)
     # Explicit sentinel instead of relying on UnboundLocalError, which
     # would also mislabel unrelated errors raised while polling workers.
     dnodes = None
     for node in nnodes:
         try:
             jdata = self._get_metrics(node)
             self._host = node.split(':')[0]
         except BDMonException as err:
             self._lgr.warning(
                 "Unable to get namenode metrics, ignoring Node:%s", node)
             self._lgr.warning("Received namenode error:%s", err)
             self.mtrx['warning'] += 1
             continue
         is_active = 'N'
         cltime = datetime.now()
         for mtrx in jdata["beans"]:
             if mtrx.get("tag.HAState", 'N') == 'active':
                 is_active = 'Y'
             elif mtrx.get("LiveNodes", "{}") != "{}":
                 self._lgr.info("Getting datanodes list")
                 if hdfs["datanodes"]:  #restricted datanode list
                     dnodes = hdfs["datanodes"].replace(' ', '').split(',')
                     self._lgr.info("Restricted dataNodes: %s, Total:%s",
                                    dnodes, len(dnodes))
                 else:  #Get the list of all datanodes
                     lnodes = ujson.loads(mtrx["LiveNodes"])
                     self._lgr.info("DataNodes to process: %s",
                                    lnodes.keys())
                     dnodes = [lnodes[dna]["infoAddr"] for dna in lnodes]
         for mtrx in jdata["beans"]:
             mtype = mtrx["modelerType"]
             self._lgr.debug("NameNode:%s ; Check metrics:%s", self._host,
                             mtype)
             #Metrics can be a substring from the start, hence the startswith
             #comes in handy with a lot metrics of the similar modelerType
             #e.g. instead of "sun.management.OperatingSystemImpl", it can be "sun.management.O"
             #self._appmtrx: Dict {appname:{appcomponent:[(modelertype1, mtypename1),...]}}
             if any(
                     mtype.startswith(mx[0])
                     for mx in self._appmtrx['hdfs']['namenode']):
                 self._lgr.info("NameNode:%s ; Process metrics:%s",
                                self._host, mtype)
                 if mtype == "sun.management.OperatingSystemImpl":
                     self._ins_osdata(mtrx, 'hdfs', 'namenode', cltime)
                 else:
                     # Keep only numeric values (anything exposing .real)
                     # that are not NaN.
                     self._dbo.values = [
                         (self._host, is_active, mtype, key, val, cltime)
                         for key, val in mtrx.items()
                         if hasattr(val, 'real') and not isnan(val)
                     ]
                     self._dbo.stmt = self._dbo_stmts["hdfs_namenode"]
                     self._bulk_insdb()
     #Namenodes complete, now process datanodes
     if dnodes is None:
         errmsg = "BDM-HD-00: Looks like we don't have an active Namenode"
         self._lgr.error(errmsg)
         self.mtrx['error'] += 1
         raise BDMonException(errmsg)
     self._lgr.info("Datanodes list: %s", dnodes)
     self._get_metrics_workers(dnodes, hdfs["dnport"], 'hdfs', 'datanode')