def unregi_url(url):
    try:
        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL + ' WHERE url = \'' + url + '\''

        with DB.connection() as conn:
            url_info = conn.cursor().execute(sql).fetchall()
            conn.close()

        # not registered
        if len(url_info) == 0:
            res_body = {'Result': 'SUCCESS'}
        else:
            # delete db
            sql = 'DELETE FROM ' + DB.REGI_SYS_TBL + ' WHERE url = \'' + url + '\''
            ret = DB.sql_execute(sql)

            if ret == 'SUCCESS':
                res_body = {'Result': 'SUCCESS'}
            else:
                res_body = {'Result': 'FAIL'}

        return res_body
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def proc_dis_system(node, dummy):
    try:
        result = dict()

        for sys_type in CONF.watchdog()['check_system']:
            event_list = DB.get_event_list(sys_type)

            sql = 'SELECT ' + DB.STATUS_TBL + '.nodename, ' + DB.NODE_INFO_TBL + '.ip_addr, ' + \
                  ", ".join(event_list) + \
                  ' FROM ' + DB.STATUS_TBL + \
                  ' INNER JOIN ' + DB.NODE_INFO_TBL + \
                  ' ON ' + DB.STATUS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + \
                  '.nodename WHERE type = \'' + sys_type + '\''

            if not node == 'all':
                sql = sql + ' and ' + DB.NODE_INFO_TBL + '.nodename = \'' + node + '\''

            with DB.connection() as conn:
                nodes_info = conn.cursor().execute(sql).fetchall()
                conn.close()

            for row in nodes_info:
                line = dict()
                line['TYPE'] = sys_type
                line['IP'] = row[1]

                i = 2
                for item in event_list:
                    line[item] = row[i]
                    i = i + 1

                result[row[0]] = line

        return result
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def regi_url(url, auth):
    try:
        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL + ' WHERE url = \'' + url + '\''

        with DB.connection() as conn:
            url_info = conn.cursor().execute(sql).fetchall()
            conn.close()

        # already registered
        if len(url_info) == 1:
            res_body = {'Result': 'SUCCESS'}
        else:
            # insert db
            sql = 'INSERT INTO ' + DB.REGI_SYS_TBL + ' VALUES (\'' + url + '\', \'' + auth + '\' )'
            ret = DB.sql_execute(sql)

            if ret == 'SUCCESS':
                res_body = {'Result': 'SUCCESS'}
            else:
                res_body = {'Result': 'FAIL'}

        return res_body
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def get_event_list(url, auth):
    try:
        sql_evt = 'SELECT * FROM ' + DB.EVENT_TBL

        with DB.connection() as conn:
            evt_list = conn.cursor().execute(sql_evt).fetchall()
            conn.close()

        event_list = []
        for nodename, item, grade, pre_grade, reason, time in evt_list:
            evt = {
                'event': 'occur',
                'system': nodename,
                'item': item,
                'grade': grade,
                'pre_grade': pre_grade,
                'reason': reason,
                'time': time
            }
            event_list.append(evt)

        res_body = {'Result': 'SUCCESS', 'Event list': event_list}
        return res_body
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def push_event(node_name, item, grade, desc, time):
    sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL

    with DB.connection() as conn:
        url_list = conn.cursor().execute(sql).fetchall()
        conn.close()

    for url, auth in url_list:
        header = {'Content-Type': 'application/json', 'Authorization': auth}
        req_body = {
            'event': 'occur',
            'system': node_name,
            'item': item,
            'grade': grade,
            'desc': desc,
            'time': time
        }
        req_body_json = json.dumps(req_body)

        try:
            requests.post(url, headers=header, data=req_body_json, timeout=2)
        except:
            # rest timeout
            LOG.exception()
def xos_status_check(conn, db_log, node_name):
    xos_status = 'ok'
    xos_list = []
    fail_reason = []

    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/xoses/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return 'fail', None

        xos_array = json.loads(output)

        for xos_info in xos_array:
            backend_status = xos_info['backend_status']
            LOG.info('xos_status_backend_status = ' + backend_status)

            tmp = str(backend_status).split('-')

            if tmp[0].strip() == '0':
                status = 'ok'
            else:
                status = 'nok'

            xos_json = {
                'name': xos_info['name'],
                'status': status,
                'description': tmp[1].strip()
            }
            xos_list.append(xos_json)

            if status == 'nok':
                xos_status = 'nok'
                fail_reason.append(xos_json)

        try:
            sql = 'UPDATE ' + DB.XOS_TBL + \
                  ' SET xos_status = \"' + str(xos_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE XOS STATUS INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] XOS STATUS DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        xos_status = 'fail'

    return xos_status, fail_reason
def regi_url(url, auth):
    try:
        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL + ' WHERE url = \'' + url + '\''
        sql_evt = 'SELECT * FROM ' + DB.EVENT_TBL

        with DB.connection() as conn:
            url_info = conn.cursor().execute(sql).fetchall()
            evt_list = conn.cursor().execute(sql_evt).fetchall()
            conn.close()

        event_list = []
        for nodename, item, grade, desc, time in evt_list:
            if not grade in ['ok', 'normal']:
                evt = {
                    'event': 'occur',
                    'system': nodename,
                    'item': item,
                    'grade': grade,
                    'desc': desc,
                    'time': time
                }
                event_list.append(evt)

        # already registered
        if len(url_info) == 1:
            res_body = {'Result': 'SUCCESS', 'Event list': event_list}
        else:
            # insert db
            sql = 'INSERT INTO ' + DB.REGI_SYS_TBL + ' VALUES (\'' + url + '\', \'' + auth + '\' )'
            ret = DB.sql_execute(sql)

            if ret == 'SUCCESS':
                res_body = {'Result': 'SUCCESS', 'Event list': event_list}
            else:
                res_body = {'Result': 'FAIL'}

        return res_body
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def onos_ha_check(conn, db_log):
    try:
        stats_url = CONF.ha()['ha_proxy_server']
        account = CONF.ha()['ha_proxy_account']

        cmd = 'curl --user ' + account + \
              ' --header \'Accept: text/html, application/xhtml+xml, image/jxr, */*\' \"' + \
              stats_url + '\"'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return None
        else:
            report_data = csv.DictReader(output.lstrip('# ').splitlines())

            dic_stat = dict()
            for row in report_data:
                if row['pxname'].strip() == 'stats' or row['svname'].strip() == 'BACKEND':
                    continue

                dtl_list = {
                    'name': row['svname'],
                    'req_count': row['stot'],
                    'succ_count': row['hrsp_2xx'],
                    'node_sts': row['status']
                }

                svc_type = row['pxname']
                if dic_stat.has_key(svc_type):
                    dic_stat[svc_type].append(dtl_list)
                else:
                    dic_stat[svc_type] = list()
                    dic_stat[svc_type].append(dtl_list)

            try:
                str_dic_stat = str(dic_stat)
                sql = 'UPDATE ' + DB.HA_TBL + \
                      ' SET stats = \"' + str_dic_stat + '\"' + \
                      ' WHERE ha_key = \"' + 'HA' + '\"'
                db_log.write_log('----- UPDATE HA INFO -----\n' + sql)

                if DB.sql_execute(sql, conn) != 'SUCCESS':
                    db_log.write_log('[FAIL] HA DB Update Fail.')
            except:
                LOG.exception()

            return dic_stat
    except:
        LOG.exception()
        return None
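# Illustrative sketch only: the shape of the dict onos_ha_check() builds from the
# HAProxy stats CSV and stores in DB.HA_TBL. The keys mirror dtl_list above; the
# proxy and server names below are made up, not taken from any real deployment.
EXAMPLE_HA_STATS = {
    'onos_rest': [
        {'name': 'onos1', 'req_count': '120', 'succ_count': '118', 'node_sts': 'UP'},
        {'name': 'onos2', 'req_count': '95', 'succ_count': '95', 'node_sts': 'UP'},
    ]
}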
def push_event(node_name, item, grade, pre_grade, reason, time, flush_alarm):
    global history_log

    try:
        history_log.write_log('[%s][%s][%s->%s] %s', node_name, item, pre_grade, grade, reason)

        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL

        with DB.connection() as conn:
            url_list = conn.cursor().execute(sql).fetchall()
            conn.close()

        for url, auth in url_list:
            header = {
                'Content-Type': 'application/json',
                'Authorization': str(auth)
            }
            req_body = {
                'system': node_name,
                'item': item,
                'grade': grade,
                'pre_grade': pre_grade,
                'reason': reason,
                'time': time
            }
            req_body_json = json.dumps(req_body)

            try:
                requests.post(str(url), headers=header, data=req_body_json, timeout=2)
            except:
                # Push event does not respond
                pass

        reason_str = ''
        if type(reason) == list:
            if len(reason) > 0:
                reason_str = '-- ' + '\n-- '.join(reason)
        else:
            reason_str = str(reason)

        ALARM.queue_alarm(node_name + ' ' + item + ' ' + grade.upper(), reason_str, time)

        if flush_alarm:
            ALARM.flush_pending_alarm()
    except:
        LOG.exception()
def occur_event(conn, node_name, item, pre_value, cur_value):
    time = str(datetime.now())
    desc = pre_value + ' -> ' + cur_value

    sql = 'UPDATE ' + DB.EVENT_TBL + \
          ' SET grade = \'' + cur_value + '\'' + ',' + \
          ' desc = \'' + desc + '\'' + ',' + \
          ' time = \'' + time + '\'' + \
          ' WHERE nodename = \'' + node_name + '\' and item = \'' + item + '\''
    LOG.info('Update alarm info = ' + sql)

    if DB.sql_execute(sql, conn) != 'SUCCESS':
        LOG.error('DB Update Fail.')

    push_event(node_name, item, cur_value, desc, time)
def get_node_list(nodes, param, tbl=DB.NODE_INFO_TBL):
    try:
        if nodes == 'all':
            sql = 'SELECT ' + param + ' FROM ' + tbl
        else:
            sql = 'SELECT ' + param + ' FROM ' + tbl + ' WHERE nodename = \'' + nodes + '\''

        with DB.connection() as conn:
            nodes_info = conn.cursor().execute(sql).fetchall()
            conn.close()

        return nodes_info
    except:
        LOG.exception()
        return None
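# Hedged usage sketch for get_node_list(): the column list matches the SELECT issued
# by periodic() further below; this helper itself is illustrative and not part of the
# original source.
def _log_node_summary():
    rows = get_node_list('all', 'nodename, ip_addr, username, type, sub_type') or []
    for node_name, node_ip, user_name, node_type, sub_type in rows:
        LOG.info('node=%s ip=%s user=%s type=%s/%s',
                 node_name, node_ip, user_name, node_type, sub_type)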
def proc_dis_ha(dummy, param):
    try:
        sql = 'SELECT stats FROM ' + DB.HA_TBL + ' WHERE ha_key = \'HA\''

        with DB.connection() as conn:
            nodes_info = conn.cursor().execute(sql).fetchone()
            conn.close()

        for value in nodes_info:
            return json.loads(str(value).replace('\'', '\"'))

        return {'HA': 'FAIL'}
    except:
        LOG.exception()
        return {'Result': 'FAIL'}
def run(self):
    # DB initiation
    DB.db_initiation()

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print 'Rest Server failed to start'
        LOG.exception()
        sys.exit(1)

    # Periodic monitoring
    if CONF.watchdog()['interval'] == 0:
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)
    else:
        LOG.info("--- Periodic Monitoring Start ---")

        conn = DB.connection()
        while True:
            try:
                watchdog.periodic(conn)
                time.sleep(CONF.watchdog()['interval'])
            except:
                watchdog.push_event('sonawatcher', 'disconnect', 'critical',
                                    'sonawatcher server shutdown', str(datetime.now()))
                conn.close()
                LOG.exception()
                sys.exit(1)
def occur_event(conn, db_log, node_name, item, pre_grade, cur_grade, reason):
    try:
        time = str(datetime.now())

        sql = 'UPDATE ' + DB.EVENT_TBL + \
              ' SET grade = \'' + cur_grade + '\'' + ',' + \
              ' pre_grade = \'' + pre_grade + '\'' + ',' + \
              ' reason = \"' + str(reason) + '\"' + ',' + \
              ' time = \'' + time + '\'' + \
              ' WHERE nodename = \'' + node_name + '\' and item = \'' + item + '\''
        db_log.write_log('----- UPDATE EVENT INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] EVENT INFO DB Update Fail.')

        push_event(node_name, item, cur_grade, pre_grade, reason, time, False)
    except:
        LOG.exception()
def push_event(node_name, item, grade, pre_grade, reason, time):
    global history_log

    try:
        history_log.write_log('[%s][%s][%s][%s] %s', node_name, item, grade, pre_grade, reason)

        sql = 'SELECT * FROM ' + DB.REGI_SYS_TBL

        with DB.connection() as conn:
            url_list = conn.cursor().execute(sql).fetchall()
            conn.close()

        for url, auth in url_list:
            header = {
                'Content-Type': 'application/json',
                'Authorization': str(auth)
            }
            req_body = {
                'system': node_name,
                'item': item,
                'grade': grade,
                'pre_grade': pre_grade,
                'reason': reason,
                'time': time
            }
            req_body_json = json.dumps(req_body)

            try:
                requests.post(str(url), headers=header, data=req_body_json, timeout=2)
            except:
                # Push event does not respond
                pass
    except:
        LOG.exception()
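# Illustration only: an example of the JSON body push_event() POSTs to each registered
# URL. The field names come from req_body above; every concrete value here is made up.
EXAMPLE_PUSH_BODY = {
    'system': 'onos-1',
    'item': 'ONOS_CLUSTER',
    'grade': 'critical',
    'pre_grade': 'ok',
    'reason': ['Node onos-2 DOWN'],
    'time': '2018-01-01 00:00:00.000000'
}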
def check_resource(conn, db_log, node_name, user_name, node_ip):
    try:
        cpu = str(get_cpu_usage(user_name, node_ip, True))
        mem = str(get_mem_usage(user_name, node_ip, True))
        disk = str(get_disk_usage(user_name, node_ip, True))

        try:
            sql = 'UPDATE ' + DB.RESOURCE_TBL + \
                  ' SET cpu = \'' + cpu + '\',' + \
                  ' memory = \'' + mem + '\',' + \
                  ' disk = \'' + disk + '\'' \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE RESOURCE INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] RESOURCE DB Update Fail.')
        except:
            LOG.exception()

        return cpu, mem, disk
    except:
        LOG.exception()
        return -1, -1, -1
def swarm_check(conn, db_log, node_name, user_name, node_ip):
    str_node = ''
    str_service = ''
    str_ps = ''
    ret_app = 'ok'
    ret_node = 'ok'

    node_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker node ls')

    if node_rt is not None:
        try:
            leader_flag = False
            for line in node_rt.splitlines():
                line = line.decode('utf-8')
                str_node = str_node + line + '\n'

                if line.startswith('ID'):
                    continue

                if 'Leader' in line:
                    leader_flag = True

                    if not ('Ready' in line and 'Active' in line):
                        ret_node = 'nok'
                        break

                if 'Down' in line:
                    ret_node = 'nok'
                    break

            if not leader_flag:
                ret_node = 'nok'
        except:
            LOG.exception()
            ret_node = 'nok'
    else:
        LOG.error("\'%s\' Swarm Node Check Error", node_ip)
        str_node = 'fail'

    service_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker service ls')

    if service_rt is not None:
        try:
            for app in CONF.swarm()['app_list']:
                find_flag = False

                for line in service_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    id, name, mode, rep, img = line.split()

                    if app == name:
                        find_flag = True
                        rep_tmp = rep.split('/')

                        if not (rep_tmp[0] == rep_tmp[1]):
                            ret_app = 'nok'
                        break

                if not find_flag:
                    ret_app = 'nok'
                    break
        except:
            LOG.exception()
            ret_app = 'nok'

        for line in service_rt.splitlines():
            line = line.decode('utf-8')
            str_service = str_service + line + '\n'
    else:
        LOG.error("\'%s\' Swarm Service Check Error", node_ip)
        str_service = 'fail'
        ret_app = 'nok'

    try:
        for app in CONF.swarm()['app_list']:
            ps_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker service ps ' + app)
            str_ps = str_ps + ' * ' + app + '\n\n'

            if ps_rt is not None:
                for line in ps_rt.splitlines():
                    line = line.decode('utf-8')
                    str_ps = str_ps + line + '\n'
            else:
                LOG.error("\'%s\' Swarm PS Check Error", node_ip)
                str_ps = str_ps + 'Command failure(' + app + ')\n'

            str_ps = str_ps + '\n'
    except:
        LOG.exception()

    try:
        sql = 'UPDATE ' + DB.SWARM_TBL + \
              ' SET node = \'' + str_node + '\',' + \
              ' service = \'' + str_service + '\',' + \
              ' ps = \'' + str_ps + '\'' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE SWARM INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] SWARM DB Update Fail.')
    except:
        LOG.exception()

    return ret_app, ret_node
def swarm_node_check(conn, db_log, node_name, username, node_ip, swarm_manager):
    node_status = 'ok'
    node_list = []
    fail_reason = []

    try:
        cmd = 'ssh root@' + swarm_manager + ' \"sudo docker node ls\"'
        node_rt = SshCommand.ssh_exec(username, node_ip, cmd)

        if node_rt is not None:
            try:
                leader_flag = False
                for line in node_rt.splitlines():
                    line = line.decode('utf-8')
                    line = " ".join(line.replace('*', '').split())
                    tmp = line.split(' ')

                    if line.startswith('ID'):
                        continue

                    if 'Leader' in line:
                        node_json = {
                            'hostname': tmp[1],
                            'status': tmp[2],
                            'availability': tmp[3],
                            'manager': tmp[4]
                        }
                        leader_flag = True

                        if not ('Ready' in line and 'Active' in line):
                            node_status = 'nok'
                            fail_reason.append(tmp[1] + ' node is not ready.')
                    else:
                        node_json = {
                            'hostname': tmp[1],
                            'status': tmp[2],
                            'availability': tmp[3],
                            'manager': ''
                        }

                        if 'Down' in line:
                            node_status = 'nok'
                            fail_reason.append(tmp[1] + ' node is down.')

                    node_list.append(node_json)

                if not leader_flag:
                    node_status = 'nok'
                    fail_reason.append('swarm leader node does not exist.')
            except:
                LOG.exception()
                node_status = 'nok'
        else:
            LOG.error("\'%s\' Swarm Node Check Error", node_ip)
            node_status = 'fail'

        try:
            sql = 'UPDATE ' + DB.SWARM_TBL + \
                  ' SET node = \"' + str(node_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SWARM NODE INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SWARM NODE DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        node_status = 'fail'

    return node_status, fail_reason
def swarm_service_check(conn, db_log, node_name, username, node_ip, swarm_manager):
    service_status = 'ok'
    service_list = []
    ps_list = []
    fail_reason = []

    try:
        cmd = 'ssh root@' + swarm_manager + ' \"sudo docker service ls\"'
        service_rt = SshCommand.ssh_exec(username, node_ip, cmd)

        instance_list = get_service_list()

        if service_rt is not None:
            try:
                for svc in instance_list:
                    find_flag = False

                    for line in service_rt.splitlines():
                        line = line.decode('utf-8')

                        if line.startswith('ID'):
                            continue

                        id, name, mode, rep, img = line.split()

                        if svc == name:
                            find_flag = True
                            rep_tmp = rep.split('/')

                            if not (rep_tmp[0] == rep_tmp[1]):
                                service_status = 'nok'
                                svc_json = {
                                    'name': name,
                                    'mode': mode,
                                    'replicas': rep,
                                    'image': img,
                                    'status': 'nok',
                                    'monitor_item': True
                                }
                                fail_reason.append(svc_json)
                            else:
                                svc_json = {
                                    'name': name,
                                    'mode': mode,
                                    'replicas': rep,
                                    'image': img,
                                    'status': 'ok',
                                    'monitor_item': True
                                }
                            service_list.append(svc_json)

                    if not find_flag:
                        service_status = 'nok'
                        fail_reason.append('swarm ' + svc + ' service does not exist.')
                        break

                for line in service_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    id, name, mode, rep, img = line.split()

                    if name in instance_list:
                        continue

                    rep_tmp = rep.split('/')
                    if not (rep_tmp[0] == rep_tmp[1]):
                        svc_json = {
                            'name': name,
                            'mode': mode,
                            'replicas': rep,
                            'image': img,
                            'status': 'nok',
                            'monitor_item': False
                        }
                    else:
                        svc_json = {
                            'name': name,
                            'mode': mode,
                            'replicas': rep,
                            'image': img,
                            'status': 'ok',
                            'monitor_item': False
                        }
                    service_list.append(svc_json)
            except:
                LOG.exception()
                service_status = 'fail'
        else:
            LOG.error("\'%s\' Swarm Service Check Error", node_ip)
            service_status = 'fail'

        for app in instance_list:
            cmd = 'ssh root@' + swarm_manager + ' \"sudo docker service ps ' + app + '\"'
            ps_rt = SshCommand.ssh_exec(username, node_ip, cmd)

            if ps_rt is not None:
                for line in ps_rt.splitlines():
                    line = line.decode('utf-8')

                    if line.startswith('ID'):
                        continue

                    line = line.replace(' \_ ', '')
                    line = " ".join(line.split())
                    tmp = line.split(' ')

                    ps_json = {
                        'name': tmp[1],
                        'image': tmp[2],
                        'node': tmp[3],
                        'desired_state': tmp[4],
                        'current_state': tmp[5]
                    }
                    ps_list.append(ps_json)
            else:
                LOG.error("\'%s\' Swarm PS Check Error", node_ip)

        try:
            sql = 'UPDATE ' + DB.SWARM_TBL + \
                  ' SET service = \"' + str(service_list) + '\",' + \
                  ' ps = \"' + str(ps_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SWARM SERVICE/PS INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SWARM SERVICE/PS DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        service_status = 'fail'

    return service_status, fail_reason
def onos_app_check(conn, db_log, node_name, node_ip):
    try:
        # initialise results before the SSH call so the fail path always has them
        status = 'ok'
        app_active_list = list()
        app_list = []
        fail_reason = []

        app_rt = SshCommand.onos_ssh_exec(node_ip, 'apps -a -s')

        if app_rt is not None:
            for line in app_rt.splitlines():
                app_active_list.append(line.split(".")[2].split()[0])

            if not 'cpman' in app_active_list:
                # activate cpman
                LOG.info('Cpman does not exist. Activate cpman')
                SshCommand.onos_ssh_exec(node_ip, 'app activate org.onosproject.cpman')

            for app in CONF.onos()['app_list']:
                if app in app_active_list:
                    app_json = {
                        'name': app,
                        'status': 'ok',
                        'monitor_item': True
                    }
                    app_active_list.remove(app)
                else:
                    status = 'nok'
                    app_json = {
                        'name': app,
                        'status': 'nok',
                        'monitor_item': True
                    }
                    fail_reason.append(app_json)

                app_list.append(app_json)

            for app in app_active_list:
                app_json = {'name': app, 'status': 'ok', 'monitor_item': False}
                app_list.append(app_json)
        else:
            LOG.error("\'%s\' ONOS Application Check Error", node_ip)
            status = 'fail'
            app_list = 'fail'

        try:
            sql = 'UPDATE ' + DB.ONOS_TBL + \
                  ' SET applist = \"' + str(app_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE ONOS APP INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] ONOS APP DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        status = 'fail'

    return status, fail_reason
def periodic(conn, pre_stat, db_log):
    try:
        cur_info = {}
        #LOG.info('Periodic checking %s', str(CONF.watchdog()['check_system']))

        try:
            node_list = cmd_proc.get_node_list('all', 'nodename, ip_addr, username, type, sub_type')
            if not node_list:
                LOG.info("Not Exist Node data ...")
                return
        except:
            LOG.exception()
            return

        # Read cur alarm status
        sql = 'SELECT nodename, item, grade FROM ' + DB.EVENT_TBL
        db_log.write_log(sql)
        cur_grade = conn.cursor().execute(sql).fetchall()

        old_nok_count = 0
        for nodename, item, grade in cur_grade:
            if not cur_info.has_key(nodename):
                cur_info[nodename] = {}
            cur_info[nodename][item] = grade

            if grade != 'ok':
                old_nok_count += 1

        new_nok_count = 0
        for node_name, node_ip, user_name, type, sub_type in node_list:
            #LOG.info('------------------------------------ ' + node_name + ' START ------------------------------------')
            onos_cluster = 'fail'
            onos_device = 'fail'
            onos_link = 'fail'
            onos_app = 'fail'

            # ping check
            ping = net_check(node_ip)
            ping_reason = []
            if ping != 'ok':
                ping_reason.append('ping check failed on ' + node_ip)
                new_nok_count += 1

            ping = alarm_event.process_event(conn, db_log, node_name, type, 'PING',
                                             cur_info[node_name]['PING'], ping, ping_reason)

            if ping == 'ok':
                if type.upper() == 'ONOS':
                    # check connection
                    onos_cluster, onos_device, onos_link, onos_app, \
                        cluster_reason, device_reason, link_reason, app_reason = \
                        chk_onos.onos_check(conn, db_log, node_name, node_ip)

                    onos_cluster = alarm_event.process_event(
                        conn, db_log, node_name, type, 'ONOS_CLUSTER',
                        cur_info[node_name]['ONOS_CLUSTER'], onos_cluster, cluster_reason)
                    onos_device = alarm_event.process_event(
                        conn, db_log, node_name, type, 'ONOS_DEVICE',
                        cur_info[node_name]['ONOS_DEVICE'], onos_device, device_reason)
                    onos_link = alarm_event.process_event(
                        conn, db_log, node_name, type, 'ONOS_LINK',
                        cur_info[node_name]['ONOS_LINK'], onos_link, link_reason)
                    onos_app = alarm_event.process_event(
                        conn, db_log, node_name, type, 'ONOS_APP',
                        cur_info[node_name]['ONOS_APP'], onos_app, app_reason)

                    if onos_cluster != 'ok':
                        new_nok_count += 1
                    if onos_device != 'ok':
                        new_nok_count += 1
                    if onos_link != 'ok':
                        new_nok_count += 1
                    if onos_app != 'ok':
                        new_nok_count += 1

            try:
                sql = 'UPDATE ' + DB.STATUS_TBL + \
                      ' SET' + \
                      ' PING = \'' + ping + '\',' + \
                      ' ONOS_CLUSTER = \'' + onos_cluster + '\',' + \
                      ' ONOS_DEVICE = \'' + onos_device + '\',' + \
                      ' ONOS_LINK = \'' + onos_link + '\',' + \
                      ' ONOS_APP = \'' + onos_app + '\',' + \
                      ' time = \'' + str(datetime.now()) + '\'' + \
                      ' WHERE nodename = \'' + node_name + '\''
                db_log.write_log('----- UPDATE TOTAL SYSTEM INFO -----\n' + sql)

                if DB.sql_execute(sql, conn) != 'SUCCESS':
                    db_log.write_log('[FAIL] TOTAL SYSTEM INFO DB Update Fail.')
            except:
                LOG.exception()

            # do not log when everything is ok
            if old_nok_count > 0:
                LOG.info('chk_onos[%s]: ping=%s cluster=%s device=%s link=%s app=%s'
                         % (node_name, ping, onos_cluster, onos_device, onos_link, onos_app))

        if old_nok_count > 0 and new_nok_count == 0:
            alarm_event.process_event(conn, db_log, 'ALL', 'SITE', 'STATUS', 'none', 'ok', [])

        # send all pending alarm messages
        alarm_event.flush_event_alarm()
    except:
        LOG.exception()

    return pre_stat
def vrouter_check(conn, db_log, node_name, user_name, node_ip):
    ret_docker = 'ok'
    docker_list = []
    fail_list = []
    onos_id = ''

    docker_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker ps')

    if docker_rt is not None:
        try:
            for docker in CONF.openstack()['docker_list']:
                for line in docker_rt.splitlines():
                    if line.startswith('CONTAINER'):
                        continue

                    tmp_line = line.split()

                    if ' ' + docker in line:
                        if not 'Up' in line:
                            docker_json = {'name': docker, 'status': 'nok', 'type': 'docker'}
                            fail_list.append(docker_json)
                            ret_docker = 'nok'
                        else:
                            docker_json = {'name': docker, 'status': 'ok', 'type': 'docker'}
                        docker_list.append(docker_json)

                        if 'onos' in tmp_line[1]:
                            onos_id = tmp_line[0]
        except:
            LOG.exception()
    else:
        LOG.error("\'%s\' Vrouter Node Check Error", node_ip)
        ret_docker = 'fail'

    onos_app_list = []
    route_list = []

    if not onos_id == '':
        try:
            # get onos container ip
            onos_rt = SshCommand.ssh_exec(user_name, node_ip,
                                          'sudo docker inspect ' + onos_id + ' | grep IPAddress')

            if onos_rt is not None:
                for line in onos_rt.splitlines():
                    line = line.strip()
                    if line.startswith('\"IPAddress'):
                        tmp = line.split(':')
                        onos_ip = tmp[1].strip().replace('\"', '').replace(',', '')
                        break

            app_list = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'apps -a -s')

            app_active_list = list()
            for line in app_list.splitlines():
                if line.startswith('fail'):
                    continue
                app_active_list.append(line.split(".")[2].split()[0])

            for app in CONF.openstack()['onos_vrouter_app_list']:
                if app in app_active_list:
                    app_json = {'name': app, 'status': 'ok', 'type': 'onos_app'}
                else:
                    app_json = {'name': app, 'status': 'nok', 'type': 'onos_app'}
                    fail_list.append(app_json)
                    ret_docker = 'nok'
                onos_app_list.append(app_json)

            str_route = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'routes')

            for line in str_route.splitlines():
                line = line.strip()

                if (line.startswith('Table') or line.startswith('Network')
                        or line.startswith('Total')):
                    continue

                new_line = " ".join(line.split())
                if new_line.startswith('fail'):
                    continue

                tmp = new_line.split(' ')
                route_json = {'network': tmp[0], 'next_hop': tmp[1]}
                route_list.append(route_json)
        except:
            LOG.exception()
    else:
        LOG.info('can not find onos_id.')
        ret_docker = 'fail'

    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET docker = \"' + str(docker_list) + '\",' + \
              ' onosApp = \"' + str(onos_app_list) + '\",' + \
              ' routingTable = \"' + str(route_list) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE GATEWAY INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] GATEWAY DB Update Fail.')
    except:
        LOG.exception()

    return ret_docker, fail_list
def run(self):
    db_log = USER_LOG()
    db_log.set_log('db.log', CONF.base()['log_rotate_time'], CONF.base()['log_backup_count'])

    pre_stat = dict()

    # DB initiation
    DB.db_initiation(db_log)

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print 'Rest Server failed to start'
        LOG.exception()
        self.exit()

    # Periodic monitoring
    if CONF.watchdog()['interval'] == 0:
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)
    else:
        LOG.info("--- Periodic Monitoring Start ---")
        history_log.write_log("--- Event History Start ---")

        conn = DB.connection()
        exitFlag = False

        while True:
            try:
                i = 0
                while i < 3:
                    i = i + 1

                    # check rest server
                    try:
                        url = 'http://' + socket.gethostbyname(socket.gethostname()) + ':' + \
                              str(CONF.rest()['rest_server_port']) + '/alive-check'
                        cmd = 'curl -X GET \"' + url + '\"'
                        LOG.info('cmd = ' + cmd)

                        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
                        output, error = result.communicate()

                        if result.returncode != 0:
                            LOG.info('REST SERVER CHECK FAIL [' + str(i) + ']')

                            if i == 3:
                                LOG.info('fail to check rest server.')
                                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT',
                                                       'critical', 'normal',
                                                       'sonawatcher server shutdown',
                                                       str(datetime.now()))
                                conn.close()
                                exitFlag = True
                                self.exit()
                                break
                        else:
                            break
                    except:
                        LOG.exception()

                if exitFlag:
                    break

                pre_stat = watchdog.periodic(conn, pre_stat, db_log)
                time.sleep(CONF.watchdog()['interval'])
            except:
                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT', 'critical',
                                       'normal', 'sonawatcher server shutdown',
                                       str(datetime.now()))
                conn.close()
                LOG.exception()
def get_node_traffic(conn, db_log, node_name, rx_dic, tx_dic, total_rx, total_tx, err_info, pre_stat):
    try:
        status = 'ok'
        reason_list = []
        pre_total_rx = total_rx
        pre_total_tx = total_tx

        # defaults so the summary below is still defined if the checks fail early
        port_json = {}
        ratio = -1
        description = ''

        # check minimum packet count
        sql = 'SELECT data_ip FROM ' + DB.OPENSTACK_TBL + ' WHERE nodename = \'' + node_name + '\''
        data_ip = conn.cursor().execute(sql).fetchone()[0]

        sql = 'SELECT ip_addr FROM ' + DB.NODE_INFO_TBL + ' WHERE type = \'ONOS\''
        nodes_info = conn.cursor().execute(sql).fetchall()

        min_rx = 0
        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            status = 'fail'
        else:
            for ip in nodes_info:
                flows_rt = SshCommand.onos_ssh_exec(
                    ip[0], '\"flows --filter \'{tunnelDst=' + data_ip + '}\' --short\"')

                if flows_rt is not None:
                    for line in flows_rt.splitlines():
                        if 'tunnelDst' in line:
                            min_rx = min_rx + int(line.split(',')[2].split('=')[1])
                    break

        if not dict(pre_stat).has_key(node_name + '_VXLAN'):
            status = '-'
            ratio = -1
        else:
            total_rx = total_rx - int(dict(pre_stat)[node_name + '_VXLAN']['total_rx'])
            total_tx = total_tx - int(dict(pre_stat)[node_name + '_VXLAN']['total_tx'])
            cur_min = min_rx - int(dict(pre_stat)[node_name + '_VXLAN']['min_rx'])

            if total_rx == 0 and total_tx == 0:
                ratio = 100
            elif total_tx <= 0:
                LOG.info('Node Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(total_rx) * 100 / total_tx
            LOG.info('Node Traffic Ratio = ' + str(ratio))

        port_json = {'rx': rx_dic[node_name], 'minimum_rx': min_rx,
                     'rx_drop': err_info['rx_drop'], 'rx_errs': err_info['rx_err'],
                     'tx': tx_dic[node_name], 'tx_drop': err_info['tx_drop'],
                     'tx_errs': err_info['tx_err']}

        if not status == '-':
            description = 'Ratio of success for all nodes = ' + str(ratio) + \
                          ' (' + str(total_rx) + ' / ' + str(total_tx) + ')'

            if ratio < float(CONF.alarm()['node_traffic_ratio']):
                LOG.info('[NODE TRAFFIC] ratio nok')
                status = 'nok'

            if total_rx < cur_min:
                LOG.info('CUR_MIN_RX = ' + str(cur_min) + ', CUR_RX = ' + str(total_rx) +
                         ', Less than rx minimum.')
                status = 'nok'

            if err_info['rx_drop'] - int(dict(pre_stat)[node_name + '_VXLAN']['rx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] rx_drop nok')
                status = 'nok'

            if err_info['rx_err'] - int(dict(pre_stat)[node_name + '_VXLAN']['rx_err']) > 0:
                LOG.info('[NODE TRAFFIC] rx_err nok')
                status = 'nok'

            if err_info['tx_drop'] - int(dict(pre_stat)[node_name + '_VXLAN']['tx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] tx_drop nok')
                status = 'nok'

            if err_info['tx_err'] - int(dict(pre_stat)[node_name + '_VXLAN']['tx_err']) > 0:
                LOG.info('[NODE TRAFFIC] tx_err nok')
                status = 'nok'

        in_out_dic = dict()
        in_out_dic['total_rx'] = pre_total_rx
        in_out_dic['total_tx'] = pre_total_tx
        in_out_dic['min_rx'] = min_rx
        in_out_dic['rx_drop'] = err_info['rx_drop']
        in_out_dic['rx_err'] = err_info['rx_err']
        in_out_dic['tx_drop'] = err_info['tx_drop']
        in_out_dic['tx_err'] = err_info['tx_err']

        pre_stat[node_name + '_VXLAN'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    vxlan_json = {'port_stat_vxlan': port_json, 'period': CONF.watchdog()['interval'],
                  'ratio': format(ratio, '.2f'), 'current_rx': total_rx, 'current_tx': total_tx,
                  'description': description, 'threshold': CONF.alarm()['node_traffic_ratio'],
                  'status': status}

    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET vxlan_traffic = \"' + str(vxlan_json) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE VXLAN STAT INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] VXLAN STAT DB Update Fail.')
    except:
        LOG.exception()

    if not status == 'ok':
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list
def get_internal_traffic(conn, db_log, node_name, node_ip, user_name, sub_type, rx_count, patch_tx, pre_stat):
    try:
        status = 'ok'
        in_packet = 0
        out_packet = 0
        reason_list = []
        desc = ''
        vxlan_json = {}  # default so the reason list below never hits an undefined name

        if sub_type == 'COMPUTE':
            flow_rt = SshCommand.ssh_exec(user_name, node_ip,
                                          'sudo ovs-ofctl -O OpenFlow13 dump-flows br-int')

            inport_cnt = 0
            gw_cnt = 0
            output_cnt = 0

            if flow_rt is not None:
                for line in flow_rt.splitlines():
                    tmp = line.split(',')

                    if 'in_port' in line:
                        inport_cnt = inport_cnt + int(tmp[3].split('=')[1])
                    elif 'output' in line:
                        output_cnt = output_cnt + int(tmp[3].split('=')[1])
                    elif 'actions=group' in line:
                        gw_cnt = gw_cnt + int(tmp[3].split('=')[1])

                in_packet = inport_cnt + rx_count
                out_packet = gw_cnt + output_cnt

                port_json = {'vm_tx': inport_cnt, 'vxlan_rx': rx_count,
                             'out_gw': gw_cnt, 'output': output_cnt}
            else:
                port_json = {'vm_tx': -1, 'vxlan_rx': -1, 'out_gw': -1, 'output': -1}
                status = 'fail'
        else:
            port_json = {'vxlan_rx': rx_count, 'patch-integ': patch_tx}

            if patch_tx == -1:
                status = 'fail'
            else:
                in_packet = rx_count
                out_packet = patch_tx

        for_save_in = in_packet
        for_save_out = out_packet

        if not dict(pre_stat).has_key(node_name + '_internal'):
            status = '-'
            vxlan_json = {'port_stat_in_out': port_json, 'period': CONF.watchdog()['interval'],
                          'ratio': 0, 'current_rx': -1, 'current_tx': -1, 'description': desc,
                          'threshold': CONF.alarm()['internal_traffic_ratio'], 'status': status}
        elif status == 'ok':
            in_packet = in_packet - int(dict(pre_stat)[node_name + '_internal']['in_packet'])
            out_packet = out_packet - int(dict(pre_stat)[node_name + '_internal']['out_packet'])

            if in_packet == 0 and out_packet == 0:
                ratio = 100
            elif in_packet <= 0 or out_packet < 0:
                LOG.info('Internal Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(out_packet) * 100 / in_packet
            LOG.info('Internal Traffic Ratio = ' + str(ratio))

            desc = 'Internal Traffic Ratio = ' + str(ratio) + \
                   '(' + str(out_packet) + '/' + str(in_packet) + ')'

            if ratio < float(CONF.alarm()['internal_traffic_ratio']):
                status = 'nok'

            vxlan_json = {'port_stat_in_out': port_json, 'period': CONF.watchdog()['interval'],
                          'ratio': format(ratio, '.2f'), 'current_rx': in_packet,
                          'current_tx': out_packet, 'description': desc,
                          'threshold': CONF.alarm()['internal_traffic_ratio'], 'status': status}

        in_out_dic = dict()
        in_out_dic['in_packet'] = for_save_in
        in_out_dic['out_packet'] = for_save_out

        pre_stat[node_name + '_internal'] = in_out_dic

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET internal_traffic = \"' + str(vxlan_json) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE INTERNAL TRAFFIC INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] INTERNAL TRAFFIC DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list
def get_gw_ratio_compute(conn, db_log, node_ip, node_name, pre_stat):
    status = 'ok'
    reason = []
    json_ratio = {}  # default so the failure path below never references an undefined name

    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + \
              ' FROM ' + DB.ONOS_TBL + \
              ' INNER JOIN ' + DB.NODE_INFO_TBL + \
              ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'
        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        manage_ip = ''
        hostname = ''
        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            hostname = dict(node_info)['hostname']
                    except:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if hostname == '':
            LOG.info('Can not find hostname')
            return 'fail', pre_stat, reason

        try:
            sql = 'SELECT of_id FROM ' + DB.OPENSTACK_TBL + \
                  ' WHERE hostname = \'' + str(hostname) + '\''
            LOG.info(sql)
            node_info = conn.cursor().execute(sql).fetchone()
            of_id = node_info[0]
        except:
            LOG.exception()
            LOG.info('Can not find of_id')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')

        total_cnt = 0
        gw_list = []
        if group_rt is not None:
            for line in group_rt.splitlines():
                if of_id in line:
                    tmp = line.split(',')
                    for col in tmp:
                        if 'packets=' in col:
                            total_cnt = total_cnt + int(col.split('=')[1])
                            gw_list.append(int(col.split('=')[1]))

        str_ratio = ''
        if not dict(pre_stat).has_key(node_name + '_GW'):
            status = '-'
            json_ratio = {'ratio': '-', 'period': CONF.watchdog()['interval'], 'status': status}
        else:
            i = 0
            for gw in gw_list:
                cur_gw = gw - pre_stat[node_name + '_GW']['gw_list'][i]
                cur_total = total_cnt - pre_stat[node_name + '_GW']['gw_total']
                LOG.info('cur_gw = ' + str(cur_gw))
                LOG.info('cur_total = ' + str(cur_total))

                if cur_gw == 0 and cur_total == 0:
                    ratio = 100 / len(gw_list)
                elif cur_gw <= 0 or cur_total <= 0:
                    ratio = 0
                else:
                    ratio = float(cur_gw) * 100 / cur_total

                i = i + 1
                str_ratio = str_ratio + str(ratio) + ':'

                if ratio < float(CONF.alarm()['gw_ratio']):
                    status = 'nok'

            json_ratio = {'ratio': str_ratio.rstrip(':'),
                          'period': CONF.watchdog()['interval'], 'status': status}

        LOG.info('[COMPUTE] ' + 'GW_RATIO = ' + str_ratio.rstrip(':'))

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['gw_list'] = gw_list
        in_out_dic['gw_total'] = total_cnt

        pre_stat[node_name + '_GW'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason.append(json_ratio)

    return status, pre_stat, reason
def get_gw_ratio_gateway(conn, db_log, node_ip, node_name, rx, gw_rx_sum, pre_stat):
    status = 'ok'
    reason = []
    json_ratio = {}  # default so the failure path below never references an undefined name

    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + \
              ' FROM ' + DB.ONOS_TBL + \
              ' INNER JOIN ' + DB.NODE_INFO_TBL + \
              ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'
        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        # search data_ip
        data_ip = ''
        manage_ip = ''
        cpt_to_gw_packet = 0

        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            data_ip = dict(node_info)['data_ip']
                    except:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if data_ip == '':
            LOG.info('Can not find data ip')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')

        if group_rt is not None:
            for line in group_rt.splitlines():
                if '{tunnelDst=' + data_ip + '}' in line:
                    tmp = line.split(',')
                    for col in tmp:
                        if 'packets=' in col:
                            cpt_to_gw_packet = cpt_to_gw_packet + int(col.split('=')[1])

        if not dict(pre_stat).has_key(node_name + '_GW'):
            status = '-'
            json_ratio = {'current_rx': '-', 'current_compute_tx': '-', 'current_total': '-',
                          'ratio': '-', 'period': CONF.watchdog()['interval'], 'status': status,
                          'packet_loss': False, 'description': ''}
        else:
            cur_rx = rx - int(dict(pre_stat)[node_name + '_GW']['rx'])
            cur_total = gw_rx_sum - int(dict(pre_stat)[node_name + '_GW']['gw_rx_sum'])
            cur_packet = cpt_to_gw_packet - int(dict(pre_stat)[node_name + '_GW']['cpt_to_gw_packet'])

            if cur_rx == 0 and cur_total == 0:
                ratio = 100
            elif cur_rx <= 0 or cur_total < 0:
                ratio = 0
            else:
                ratio = float(cur_rx) * 100 / cur_total

            desc = 'GW RATIO = ' + str(ratio) + ' (' + str(cur_rx) + ' / ' + str(cur_total) + ')'

            loss_flag = False
            if cur_rx < cur_packet:
                LOG.info('GW Ratio Fail. (Data loss)')
                loss_flag = True

            LOG.info('GW Ratio = ' + str(ratio))

            if ratio < float(CONF.alarm()['gw_ratio']) or cur_rx < cur_packet:
                status = 'nok'

            json_ratio = {'current_rx': cur_rx, 'current_compute_tx': cur_packet,
                          'current_total': cur_total, 'ratio': format(ratio, '.2f'),
                          'period': CONF.watchdog()['interval'], 'status': status,
                          'packet_loss': loss_flag, 'description': desc}

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['rx'] = rx
        in_out_dic['gw_rx_sum'] = gw_rx_sum
        in_out_dic['cpt_to_gw_packet'] = cpt_to_gw_packet

        pre_stat[node_name + '_GW'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason.append(json_ratio)

    return status, pre_stat, reason
def onos_rest_check(conn, db_log, node_name, node_ip):
    try:
        web_status = 'ok'
        web_list = []
        fail_reason = []

        web_rt = SshCommand.onos_ssh_exec(node_ip, 'web:list')

        if web_rt is not None:
            for web in CONF.onos()['rest_list']:
                for line in web_rt.splitlines():
                    if line.startswith('ID') or line.startswith('--'):
                        continue

                    if ' ' + web + ' ' in line:
                        if not ('Active' in line and 'Deployed' in line):
                            rest_json = {
                                'name': web,
                                'status': 'nok',
                                'monitor_item': True
                            }
                            fail_reason.append(rest_json)
                            web_status = 'nok'
                        else:
                            rest_json = {
                                'name': web,
                                'status': 'ok',
                                'monitor_item': True
                            }
                        web_list.append(rest_json)

            for line in web_rt.splitlines():
                if line.startswith('ID') or line.startswith('--'):
                    continue

                name = " ".join(line.split()).split(' ')[10]

                if not name in CONF.onos()['rest_list']:
                    if not ('Active' in line and 'Deployed' in line):
                        rest_json = {
                            'name': name,
                            'status': 'nok',
                            'monitor_item': False
                        }
                    else:
                        rest_json = {
                            'name': name,
                            'status': 'ok',
                            'monitor_item': False
                        }
                    web_list.append(rest_json)
        else:
            LOG.error("\'%s\' ONOS Rest Check Error", node_ip)
            web_status = 'fail'
            web_list = 'fail'

        try:
            sql = 'UPDATE ' + DB.ONOS_TBL + \
                  ' SET weblist = \"' + str(web_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE ONOS REST INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] ONOS REST DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        web_status = 'fail'

    return web_status, fail_reason
def xos_sync_check(conn, db_log, node_name):
    swarm_sync = 'ok'
    sync_list = []
    fail_reason = []

    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/diags/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return 'fail', None

        sync_array = json.loads(output)

        for xos_info in sync_array:
            backend_status = xos_info['backend_status']
            LOG.info('xos_sync_backend_status = ' + backend_status)

            tmp = str(backend_status).split('-')

            if tmp[0].strip() in ['0', '1']:
                status = 'ok'
            else:
                status = 'nok'

            # check time
            last_time = json.loads(xos_info['backend_register'])['last_run']
            cur_time = time.time()

            interval = cur_time - last_time
            interval = int(interval)

            if interval >= 30:
                status = 'nok'

            xos_json = {
                'name': xos_info['name'],
                'status': status,
                'description': tmp[1].strip(),
                'last_run_interval': interval
            }
            sync_list.append(xos_json)

            if status == 'nok':
                swarm_sync = 'nok'
                fail_reason.append(xos_json)

        try:
            sql = 'UPDATE ' + DB.XOS_TBL + \
                  ' SET synchronizer = \"' + str(sync_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SYNCHRONIZER INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SYNCHRONIZER DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        swarm_sync = 'fail'

    return swarm_sync, fail_reason
def onos_check(conn, db_log, node_name, node_ip):
    # called on each ONOS node in NODE_INFO_TBL
    try:
        # check cluster nodes
        node_list = []
        node_status = 'ok'
        node_fail_reason = []

        ret, rsp = onos_api_req(node_ip, 'onos/v1/cluster')

        if rsp is not None:
            try:
                node_tbl = dict()
                for node in rsp['nodes']:
                    node_tbl[node['ip']] = node

                for onos_node in CONF.onos()['list']:
                    if len(onos_node.split(':')) != 2:
                        continue

                    id = onos_node.split(':')[0]
                    ip = onos_node.split(':')[1]

                    if id is '' or ip is '':
                        continue

                    if ip in node_tbl:
                        node = node_tbl[ip]
                        node['id'] = id
                        node['monitor_item'] = True

                        if node['status'] != 'READY':
                            node_status = 'nok'
                            node_fail_reason.append('Node ' + id + ' DOWN')

                        node_tbl.pop(ip)
                    else:
                        node = {
                            'id': id,
                            'ip': ip,
                            'status': 'nok',
                            'monitor_item': True
                        }
                        node_status = 'nok'
                        node_fail_reason.append('Node ' + id + ' DOWN')

                    node_list.append(node)

                for node in node_tbl.values():
                    node['monitor_item'] = False
                    node_list.append(node)
            except:
                LOG.exception()
                LOG.error("\'%s\' ONOS Check Error(nodes)", node_ip)
                node_status = 'fail'

        # check devices
        device_list = []
        device_status = 'ok'
        device_fail_reason = []

        ret, rsp = onos_api_req(node_ip, 'onos/v1/devices')

        if rsp is not None:
            try:
                device_tbl = dict()
                for device in rsp['devices']:
                    device['id'] = 'of:' + device['chassisId'].rjust(16, '0')
                    device_tbl[device['id']] = device

                for id in CONF.onos()['device_list']:
                    if id is '':
                        continue  # no config

                    if id in device_tbl:
                        device = device_tbl[id]
                        device['monitor_item'] = True

                        if not device['available']:
                            device_status = 'nok'
                            device_fail_reason.append('Device ' + id + ' DOWN')

                        device_tbl.pop(id)
                    else:
                        device = {
                            'id': id,
                            'available': False,
                            'channelId': '-',
                            'name': '-',
                            'role': '-',
                            'monitor_item': True
                        }
                        device_status = 'nok'
                        device_fail_reason.append('Device ' + id + ' DOWN')

                    device_list.append(device)

                for device in device_tbl.values():
                    device['monitor_item'] = False
                    device_list.append(device)
            except:
                LOG.exception()
                LOG.error("\'%s\' ONOS Check Error(devices)", node_ip)
                device_status = 'fail'
        else:
            LOG.error("\'%s\' ONOS Check Error(devices)", node_ip)
            device_status = 'fail'

        # check links
        link_list = []
        link_status = 'ok'
        link_fail_reason = []

        ret, rsp = onos_api_req(node_ip, 'onos/v1/links')

        if rsp is not None:
            try:
                link_tbl = dict()
                for link in rsp['links']:
                    link['src'] = link['src']['device'] + '/' + link['src']['port']
                    link['dst'] = link['dst']['device'] + '/' + link['dst']['port']
                    link_tbl[link['src'] + '-' + link['dst']] = link

                for id in CONF.onos()['link_list']:
                    if id is '':
                        continue

                    if len(id.split('-')) != 2:
                        link = {
                            'src': id,
                            'dst': '(invalid_link_config)',
                            'expected': 'false',
                            'state': '-',
                            'type': "-",
                            'monitor_item': True
                        }
                        link_status = 'nok'
                        link_fail_reason.append('Link ' + id + ' is configed as INVALID ID FORMAT')
                        link_list.append(link)
                        continue

                    if id in link_tbl:
                        link = link_tbl[id]
                        link['monitor_item'] = True

                        if link['state'] != 'ACTIVE':
                            link_status = 'nok'
                            link_fail_reason.append('Link ' + id + ' DOWN')

                        link_list.append(link)
                        link_tbl.pop(id)
                    else:
                        link = {
                            'src': id.split('-')[0],
                            'dst': id.split('-')[1],
                            'expected': 'false',
                            'state': '-',
                            'type': "-",
                            'monitor_item': True
                        }
                        link_status = 'nok'
                        link_fail_reason.append('Link ' + id + ' DOWN')
                        link_list.append(link)

                    rev_id = id.split('-')[1] + '-' + id.split('-')[0]
                    if rev_id in link_tbl:
                        link = link_tbl[rev_id]
                        link['monitor_item'] = True

                        if link['state'] != 'ACTIVE':
                            link_status = 'nok'
                            link_fail_reason.append('Link ' + rev_id + ' DOWN')

                        link_list.append(link)
                        link_tbl.pop(rev_id)
                    else:
                        link = {
                            'src': rev_id.split('-')[0],
                            'dst': rev_id.split('-')[1],
                            'expected': 'false',
                            'state': '-',
                            'type': "-",
                            'monitor_item': True
                        }
                        link_status = 'nok'
                        link_fail_reason.append('Link ' + rev_id + ' DOWN')
                        link_list.append(link)

                for link in link_tbl.values():
                    link['monitor_item'] = False
                    link_list.append(link)
            except:
                LOG.exception()
                LOG.error("\'%s\' ONOS Check Error(links)", node_ip)
                link_status = 'fail'

        # check apps
        app_list = []
        app_status = 'ok'
        app_fail_reason = []

        ret, rsp = onos_api_req(node_ip, 'onos/v1/applications')

        if rsp is not None:
            try:
                active_app_list = []
                for app_rsp in rsp['applications']:
                    if app_rsp['state'] == 'ACTIVE':
                        active_app_list.append(app_rsp['name'].replace('org.onosproject.', ''))

                for app in CONF.onos()['app_list']:
                    if app in active_app_list:
                        app_json = {
                            'name': app,
                            'status': 'ok',
                            'monitor_item': True
                        }
                        active_app_list.remove(app)
                    else:
                        app_json = {
                            'name': app,
                            'status': 'nok',
                            'monitor_item': True
                        }
                        app_status = 'nok'
                        app_fail_reason.append(app_json)

                    app_list.append(app_json)

                for app in active_app_list:
                    app_json = {
                        'name': app,
                        'status': 'ok',
                        'monitor_item': False
                    }
                    app_list.append(app_json)
            except:
                LOG.exception()
                LOG.error("\'%s\' ONOS Check Error(apps)", node_ip)
                app_status = 'fail'
        else:
            LOG.error("\'%s\' ONOS Check Error(apps)", node_ip)
            app_status = 'fail'

        # store to db
        try:
            sql = 'UPDATE ' + DB.ONOS_TBL + \
                  ' SET ' + \
                  ' cluster = \"' + str(node_list) + '\",' \
                  ' device = \"' + str(device_list) + '\",' \
                  ' link = \"' + str(link_list) + '\",' \
                  ' app = \"' + str(app_list) + '\"' \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE ONOS CONNECTION INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] ONOS CONNECTION DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        node_status = 'fail'
        device_status = 'fail'
        link_status = 'fail'
        app_status = 'fail'

    return node_status, device_status, link_status, app_status, \
        node_fail_reason, device_fail_reason, link_fail_reason, app_fail_reason