def get_ha_stats(ha_dic):
    """Summarise HA proxy health from the per-key status rows in ``ha_dic``.

    Args:
        ha_dic: mapping of key -> iterable of rows. Each row is convertible
            to a dict carrying 'name' and 'node_sts'; rows named 'FRONTEND'
            also carry 'req_count', every other row carries 'succ_count'.

    Returns:
        A tuple ``(ha_status, ha_ratio, list_reason, ratio_reason)``:
        ``ha_status`` is 'ok', 'nok' (a FRONTEND row without 'OPEN' or any
        other row without 'UP' in its status) or 'fail' (unexpected error);
        ``ha_ratio`` is 'ok', 'nok' (success ratio below the 'ha_proxy'
        alarm threshold) or 'fail'; ``list_reason`` lists the offending rows
        and ``ratio_reason`` holds the formatted ratio when it is 'nok'.
    """
    ha_status = 'ok'
    ha_ratio = 'ok'

    list_reason = []
    ratio_reason = []

    try:
        frontend = 0
        backend = 0

        for key in dict(ha_dic).keys():
            for line in ha_dic[key]:
                row = dict(line)
                host = row['name']
                status = row['node_sts']

                if host == 'FRONTEND':
                    if 'OPEN' not in status:
                        list_reason.append({'key': key, 'hostname': host, 'status': 'nok'})
                        ha_status = 'nok'

                    frontend = int(row['req_count'])
                else:
                    if 'UP' not in status:
                        list_reason.append({'key': key, 'hostname': host, 'status': 'nok'})
                        ha_status = 'nok'

                    backend = backend + int(row['succ_count'])

        # An idle frontend (no requests yet) must not be reported as a
        # failure of the whole check; follow the convention used by the
        # traffic checks: nothing in / nothing out counts as 100 %.
        if frontend == 0 and backend == 0:
            ratio = 100
        elif frontend <= 0:
            ratio = 0
        else:
            ratio = float(backend) * 100 / frontend

        if ratio < float(CONF.alarm()['ha_proxy']):
            ha_ratio = 'nok'
            ratio_reason.append(str(format(ratio, '.2f')))
    except Exception:
        LOG.exception()
        ha_status = 'fail'
        ha_ratio = 'fail'

    return ha_status, ha_ratio, list_reason, ratio_reason
def get_grade(item, value):
    """Translate a measured value into an alarm grade for ``item``.

    The thresholds come from ``CONF.alarm()[item]`` as a
    (critical, major, minor) triple. The literal string '-1' yields 'fail';
    otherwise the first threshold that ``value`` reaches decides the grade,
    and 'normal' is returned when it reaches none of them.
    """
    critical, major, minor = CONF.alarm()[item]

    if value == '-1':
        return 'fail'

    # Checked from the most to the least severe level; first hit wins.
    ladder = (
        (critical, 'critical'),
        (major, 'major'),
        (minor, 'minor'),
    )
    for threshold, grade in ladder:
        if float(value) >= float(threshold):
            return grade

    return 'normal'
def get_internal_traffic(conn, db_log, node_name, node_ip, user_name, sub_type, rx_count, patch_tx, pre_stat):
    """Check the internal traffic ratio (packets out / packets in) of a node.

    For a 'COMPUTE' node the counters are read from the br-int flow dump
    fetched over SSH; for any other sub type ``rx_count`` and ``patch_tx``
    are used directly (``patch_tx == -1`` means the counter is unavailable).
    The increments since the previous sample stored in ``pre_stat`` are
    compared against the 'internal_traffic_ratio' alarm threshold and the
    result is written to the ``internal_traffic`` column of the node's row.

    Args:
        conn: database connection handed to ``DB.sql_execute``.
        db_log: logger exposing ``write_log``.
        node_name: node name; ``node_name + '_internal'`` is the key used
            in ``pre_stat``.
        node_ip, user_name: SSH target for the flow dump.
        sub_type: node sub type; only 'COMPUTE' triggers the flow dump.
        rx_count: current VXLAN rx packet counter.
        patch_tx: current patch port tx counter, -1 when unavailable.
        pre_stat: dict of previous samples; updated in place.

    Returns:
        ``(status, pre_stat, reason_list)`` where status is 'ok', 'nok',
        '-' (no previous sample yet) or 'fail'; ``reason_list`` carries the
        result payload whenever status is not 'ok'.
    """
    status = 'ok'
    reason_list = []
    # Defined up front so that no failure path can reach the final
    # reason_list.append() with the name unbound.
    vxlan_json = None

    try:
        in_packet = 0
        out_packet = 0
        desc = ''

        if sub_type == 'COMPUTE':
            flow_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo ovs-ofctl -O OpenFlow13 dump-flows br-int')

            inport_cnt = 0
            gw_cnt = 0
            output_cnt = 0

            if flow_rt is not None:
                for line in flow_rt.splitlines():
                    tmp = line.split(',')
                    # The 4th comma-separated column holds '<name>=<count>'.
                    if 'in_port' in line:
                        inport_cnt = inport_cnt + int(tmp[3].split('=')[1])
                    elif 'output' in line:
                        output_cnt = output_cnt + int(tmp[3].split('=')[1])
                    elif 'actions=group' in line:
                        gw_cnt = gw_cnt + int(tmp[3].split('=')[1])

                in_packet = inport_cnt + rx_count
                out_packet = gw_cnt + output_cnt

                port_json = {'vm_tx': inport_cnt, 'vxlan_rx': rx_count, 'out_gw': gw_cnt, 'output': output_cnt}
            else:
                port_json = {'vm_tx': -1, 'vxlan_rx': -1, 'out_gw': -1, 'output': -1}
                status = 'fail'
        else:
            port_json = {'vxlan_rx': rx_count, 'patch-integ': patch_tx}

            if patch_tx == -1:
                status = 'fail'
            else:
                in_packet = rx_count
                out_packet = patch_tx

        for_save_in = in_packet
        for_save_out = out_packet

        stat_key = node_name + '_internal'

        if stat_key not in dict(pre_stat):
            # First sample for this node: nothing to compare against yet.
            status = '-'

        if status == 'ok':
            previous = dict(pre_stat)[stat_key]
            in_packet = in_packet - int(previous['in_packet'])
            out_packet = out_packet - int(previous['out_packet'])

            if in_packet == 0 and out_packet == 0:
                ratio = 100
            elif in_packet <= 0 or out_packet < 0:
                LOG.info('Internal Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(out_packet) * 100 / in_packet

            LOG.info('Internal Traffic Ratio = ' + str(ratio))
            desc = 'Internal Traffic Ratio = ' + str(ratio) + '(' + str(out_packet) + '/' + str(in_packet) + ')'

            if ratio < float(CONF.alarm()['internal_traffic_ratio']):
                status = 'nok'

            vxlan_json = {'port_stat_in_out': port_json, 'period': CONF.watchdog()['interval'],
                          'ratio': format(ratio, '.2f'), 'current_rx': in_packet, 'current_tx': out_packet,
                          'description': desc, 'threshold': CONF.alarm()['internal_traffic_ratio'],
                          'status': status}
        else:
            # No baseline ('-') or counters unavailable ('fail'): report a
            # placeholder payload. Previously the 'fail' case with an
            # existing baseline left vxlan_json undefined.
            vxlan_json = {'port_stat_in_out': port_json, 'period': CONF.watchdog()['interval'],
                          'ratio': 0, 'current_rx': -1, 'current_tx': -1,
                          'description': desc, 'threshold': CONF.alarm()['internal_traffic_ratio'],
                          'status': status}

        in_out_dic = dict()
        in_out_dic['in_packet'] = for_save_in
        in_out_dic['out_packet'] = for_save_out
        pre_stat[stat_key] = in_out_dic

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET internal_traffic = \"' + str(vxlan_json) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE INTERNAL TRAFFIC INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] INTERNAL TRAFFIC DB Update Fail.')
        except Exception:
            LOG.exception()
    except Exception:
        LOG.exception()
        status = 'fail'

    if status != 'ok':
        if vxlan_json is None:
            # The check aborted before any payload could be assembled.
            vxlan_json = {'status': status}
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list
def get_node_traffic(conn, db_log, node_name, rx_dic, tx_dic, total_rx, total_tx, err_info, pre_stat):
    """Check the VXLAN traffic of one node against the cluster-wide totals.

    The increments of ``total_rx`` / ``total_tx`` since the previous sample
    in ``pre_stat`` give the success ratio (rx * 100 / tx), which is compared
    with the 'node_traffic_ratio' alarm threshold. The check also turns
    'nok' when the rx increment is below the increment of the tunnel flow
    counter read from ONOS, or when any drop/error counter grew. The result
    is written to the ``vxlan_traffic`` column of the node's row.

    Args:
        conn: database connection (queried directly and via DB.sql_execute).
        db_log: logger exposing ``write_log``.
        node_name: node name; ``node_name + '_VXLAN'`` is the ``pre_stat`` key.
        rx_dic, tx_dic: per-node rx / tx counters, indexed by node name.
        total_rx, total_tx: summed counters over all nodes.
        err_info: dict with 'rx_drop', 'rx_err', 'tx_drop', 'tx_err'.
        pre_stat: dict of previous samples; updated in place.

    Returns:
        ``(status, pre_stat, reason_list)``; status is 'ok', 'nok',
        '-' (no previous sample yet) or 'fail'.
    """
    status = 'ok'
    reason_list = []
    # Defaults so the report below can still be built when the try block
    # aborts early (these names used to be unbound in that case).
    port_json = {}
    ratio = -1
    description = ''

    try:
        pre_total_rx = total_rx
        pre_total_tx = total_tx

        # check minimum packet count
        sql = 'SELECT data_ip FROM ' + DB.OPENSTACK_TBL + ' WHERE nodename = \'' + node_name + '\''
        data_ip = conn.cursor().execute(sql).fetchone()[0]

        sql = 'SELECT ip_addr FROM ' + DB.NODE_INFO_TBL + ' WHERE type = \'ONOS\''
        nodes_info = conn.cursor().execute(sql).fetchall()

        min_rx = 0

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            status = 'fail'
        else:
            for ip in nodes_info:
                flows_rt = SshCommand.onos_ssh_exec(ip[0], '\"flows --filter \'{tunnelDst=' + data_ip + '}\' --short\"')

                if flows_rt is not None:
                    for line in flows_rt.splitlines():
                        if 'tunnelDst' in line:
                            min_rx = min_rx + int(line.split(',')[2].split('=')[1])
                    # NOTE(review): stops at the first controller that
                    # answered - confirm this matches the intended layout.
                    break

        stat_key = node_name + '_VXLAN'

        if stat_key not in dict(pre_stat):
            status = '-'
            ratio = -1
        else:
            previous = dict(pre_stat)[stat_key]
            total_rx = total_rx - int(previous['total_rx'])
            total_tx = total_tx - int(previous['total_tx'])
            cur_min = min_rx - int(previous['min_rx'])

            if total_rx == 0 and total_tx == 0:
                ratio = 100
            # Was 'total_tx <= 0 or total_tx < 0': the second operand
            # repeated the denominator instead of checking the numerator.
            elif total_tx <= 0 or total_rx < 0:
                LOG.info('Node Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(total_rx) * 100 / total_tx

        LOG.info('Node Traffic Ratio = ' + str(ratio))

        port_json = {'rx': rx_dic[node_name], 'minimum_rx': min_rx, 'rx_drop': err_info['rx_drop'],
                     'rx_errs': err_info['rx_err'],
                     'tx': tx_dic[node_name], 'tx_drop': err_info['tx_drop'], 'tx_errs': err_info['tx_err']}

        description = ''

        if status != '-':
            description = 'Ratio of success for all nodes = ' + str(ratio) + ' (' + str(total_rx) + ' / ' + str(total_tx) + ')'

            if ratio < float(CONF.alarm()['node_traffic_ratio']):
                LOG.info('[NODE TRAFFIC] ratio nok')
                status = 'nok'

            if total_rx < cur_min:
                LOG.info('CUR_MIN_RX = ' + str(cur_min) + ', CUR_RX = ' + str(total_rx) + ', Less than rx minimum.')
                status = 'nok'

            # Any growth of a drop / error counter since the last sample
            # marks the node 'nok'.
            if err_info['rx_drop'] - int(previous['rx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] rx_drop nok')
                status = 'nok'

            if err_info['rx_err'] - int(previous['rx_err']) > 0:
                LOG.info('[NODE TRAFFIC] rx_err nok')
                status = 'nok'

            if err_info['tx_drop'] - int(previous['tx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] tx_drop nok')
                status = 'nok'

            if err_info['tx_err'] - int(previous['tx_err']) > 0:
                LOG.info('[NODE TRAFFIC] tx_err nok')
                status = 'nok'

        in_out_dic = dict()
        in_out_dic['total_rx'] = pre_total_rx
        in_out_dic['total_tx'] = pre_total_tx
        in_out_dic['min_rx'] = min_rx
        in_out_dic['rx_drop'] = err_info['rx_drop']
        in_out_dic['rx_err'] = err_info['rx_err']
        in_out_dic['tx_drop'] = err_info['tx_drop']
        in_out_dic['tx_err'] = err_info['tx_err']

        pre_stat[stat_key] = in_out_dic
    except Exception:
        LOG.exception()
        status = 'fail'

    vxlan_json = {'port_stat_vxlan': port_json, 'period': CONF.watchdog()['interval'],
                  'ratio': format(ratio, '.2f'), 'current_rx': total_rx, 'current_tx': total_tx,
                  'description': description, 'threshold': CONF.alarm()['node_traffic_ratio'], 'status': status}

    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET vxlan_traffic = \"' + str(vxlan_json) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE VXLAN STAT INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] VXLAN STAT DB Update Fail.')
    except Exception:
        LOG.exception()

    if status != 'ok':
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list
def get_gw_ratio_compute(conn, db_log, node_ip, node_name, pre_stat):
    """Check how a compute node spreads its traffic over the gateway buckets.

    The ONOS node list is searched for the entry whose 'management_ip'
    equals ``node_ip``; its hostname gives the node's ``of_id``, and the
    'packets=' counters on the matching lines of the ONOS ``groups`` output
    are collected. For every counter the share of its increment in the
    total increment is compared with the 'gw_ratio' alarm threshold. The
    result is written to the ``gw_ratio`` column of the node's row.

    Args:
        conn: database connection.
        db_log: logger exposing ``write_log``.
        node_ip: management IP of the compute node.
        node_name: node name; ``node_name + '_GW'`` is the ``pre_stat`` key.
        pre_stat: dict of previous samples; updated in place.

    Returns:
        ``(status, pre_stat, reason)``; status is 'ok', 'nok',
        '-' (no usable previous sample) or 'fail'.
    """
    status = 'ok'
    reason = []
    # Bound before the try so the failure path cannot raise NameError.
    json_ratio = None

    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + ' FROM ' + DB.ONOS_TBL + \
              ' INNER JOIN ' + DB.NODE_INFO_TBL + ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'

        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        manage_ip = ''
        hostname = ''
        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                # NOTE(review): eval() executes whatever string is stored in
                # the nodelist column. If it is always a plain literal,
                # ast.literal_eval would be a safer drop-in - confirm first.
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            hostname = dict(node_info)['hostname']
                    except Exception:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if hostname == '':
            LOG.info('Can not find hostname')
            return 'fail', pre_stat, reason

        try:
            sql = 'SELECT of_id FROM ' + DB.OPENSTACK_TBL + ' WHERE hostname = \'' + str(hostname) + '\''
            LOG.info(sql)
            node_info = conn.cursor().execute(sql).fetchone()

            of_id = node_info[0]
        except Exception:
            LOG.exception()
            LOG.info('Can not find of_id')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')

        total_cnt = 0
        gw_list = []
        if group_rt is not None:
            for line in group_rt.splitlines():
                if of_id in line:
                    for col in line.split(','):
                        if 'packets=' in col:
                            packets = int(col.split('=')[1])
                            total_cnt = total_cnt + packets
                            gw_list.append(packets)

        str_ratio = ''
        stat_key = node_name + '_GW'
        previous = dict(pre_stat).get(stat_key)

        # Without a previous sample - or when more buckets exist now than
        # were sampled before - there is nothing to diff against. The new
        # sample becomes the baseline. (Indexing past the old list used to
        # raise on every run, because the baseline was never refreshed.)
        if previous is None or len(previous['gw_list']) < len(gw_list):
            status = '-'
            json_ratio = {'ratio': '-', 'period': CONF.watchdog()['interval'], 'status': status}
        else:
            for gw, prev_gw in zip(gw_list, previous['gw_list']):
                cur_gw = gw - prev_gw
                cur_total = total_cnt - previous['gw_total']

                LOG.info('cur_gw = ' + str(cur_gw))
                LOG.info('cur_total = ' + str(cur_total))

                if cur_gw == 0 and cur_total == 0:
                    # No traffic at all: treat it as an even split.
                    ratio = 100/len(gw_list)
                elif cur_gw <= 0 or cur_total <= 0:
                    ratio = 0
                else:
                    ratio = float(cur_gw) * 100 / cur_total

                str_ratio = str_ratio + str(ratio) + ':'

                if ratio < float(CONF.alarm()['gw_ratio']):
                    status = 'nok'

            json_ratio = {'ratio': str_ratio.rstrip(':'), 'period': CONF.watchdog()['interval'], 'status': status}
            LOG.info('[COMPUTE] ' + 'GW_RATIO = ' + str_ratio.rstrip(':'))

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except Exception:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['gw_list'] = gw_list
        in_out_dic['gw_total'] = total_cnt

        pre_stat[stat_key] = in_out_dic
    except Exception:
        LOG.exception()
        status = 'fail'

    if status != 'ok':
        if json_ratio is None:
            # The check aborted before a payload could be assembled.
            json_ratio = {'ratio': '-', 'status': status}
        reason.append(json_ratio)

    return status, pre_stat, reason
def get_gw_ratio_gateway(conn, db_log, node_ip, node_name, rx, gw_rx_sum, pre_stat):
    """Check the share of gateway traffic received by one gateway node.

    The ONOS node list is searched for the entry whose 'management_ip'
    equals ``node_ip`` to learn the node's data IP; the 'packets=' counters
    of the ONOS ``groups`` lines that tunnel to that IP are summed as the
    packets the compute nodes sent to this gateway. The increments since
    the previous sample give the ratio ``rx * 100 / gw_rx_sum``, which is
    compared with the 'gw_ratio' alarm threshold; receiving fewer packets
    than the compute nodes sent is flagged as packet loss. The result is
    written to the ``gw_ratio`` column of the node's row.

    Args:
        conn: database connection.
        db_log: logger exposing ``write_log``.
        node_ip: management IP of the gateway node.
        node_name: node name; ``node_name + '_GW'`` is the ``pre_stat`` key.
        rx: current rx counter of this gateway.
        gw_rx_sum: current rx counter summed over all gateways.
        pre_stat: dict of previous samples; updated in place.

    Returns:
        ``(status, pre_stat, reason)``; status is 'ok', 'nok',
        '-' (no previous sample yet) or 'fail'.
    """
    status = 'ok'
    reason = []
    # Bound before the try so the failure path cannot raise NameError.
    json_ratio = None

    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + ' FROM ' + DB.ONOS_TBL + \
                ' INNER JOIN ' + DB.NODE_INFO_TBL + ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'

        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        # search data_ip
        data_ip = ''
        manage_ip = ''
        cpt_to_gw_packet = 0
        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                # NOTE(review): eval() executes whatever string is stored in
                # the nodelist column. If it is always a plain literal,
                # ast.literal_eval would be a safer drop-in - confirm first.
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            data_ip = dict(node_info)['data_ip']
                    except Exception:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if data_ip == '':
            LOG.info('Can not find data ip')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')

        if group_rt is not None:
            tunnel_tag = '{tunnelDst=' + data_ip + '}'
            for line in group_rt.splitlines():
                if tunnel_tag in line:
                    for col in line.split(','):
                        if 'packets=' in col:
                            cpt_to_gw_packet = cpt_to_gw_packet + int(col.split('=')[1])

        stat_key = node_name + '_GW'

        if stat_key not in dict(pre_stat):
            status = '-'
            json_ratio = {'current_rx': '-', 'current_compute_tx': '-', 'current_total': '-', 'ratio': '-',
                          'period': CONF.watchdog()['interval'], 'status': status, 'packet_loss': False,
                          'description': ''}
        else:
            previous = dict(pre_stat)[stat_key]
            cur_rx = rx - int(previous['rx'])
            cur_total = gw_rx_sum - int(previous['gw_rx_sum'])
            cur_packet = cpt_to_gw_packet - int(previous['cpt_to_gw_packet'])

            if cur_rx == 0 and cur_total == 0:
                ratio = 100
            # The denominator check was 'cur_total < 0', which let a zero
            # total through to the division below (ZeroDivisionError).
            elif cur_rx <= 0 or cur_total <= 0:
                ratio = 0
            else:
                ratio = float(cur_rx) * 100 / cur_total

            desc = 'GW RATIO = ' + str(ratio) + ' (' + str(cur_rx) + ' / ' + str(cur_total) + ')'

            loss_flag = False
            if cur_rx < cur_packet:
                LOG.info('GW Ratio Fail. (Data loss)')
                loss_flag = True

            LOG.info('GW Ratio = ' + str(ratio))

            if ratio < float(CONF.alarm()['gw_ratio']) or cur_rx < cur_packet:
                status = 'nok'

            json_ratio = {'current_rx': cur_rx, 'current_compute_tx': cur_packet, 'current_total': cur_total,
                          'ratio': format(ratio, '.2f'), 'period': CONF.watchdog()['interval'], 'status': status,
                          'packet_loss': loss_flag, 'description': desc}

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except Exception:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['rx'] = rx
        in_out_dic['gw_rx_sum'] = gw_rx_sum
        in_out_dic['cpt_to_gw_packet'] = cpt_to_gw_packet

        pre_stat[stat_key] = in_out_dic
    except Exception:
        LOG.exception()
        status = 'fail'

    if status != 'ok':
        if json_ratio is None:
            # The check aborted before a payload could be assembled.
            json_ratio = {'ratio': '-', 'status': status}
        reason.append(json_ratio)

    return status, pre_stat, reason
def controller_traffic_check(conn, db_log, node_name, node_ip, pre_stat):
    """Check the control-message traffic ratio of one ONOS controller.

    The controller's ``summary`` output supplies the address passed to
    ``cpman-stats-list``; that command is then run once per OpenStack node
    (hostname / of_id from the database) and the INBOUND / OUTBOUND packet
    counters are summed. The increments since the previous sample give the
    ratio ``out * 100 / in``, which is compared with the
    'controller_traffic_ratio' alarm threshold once the inbound increment
    exceeds 'controller_traffic_minimum_inbound'. The result is written to
    the ``traffic_stat`` column of the controller's row.

    Args:
        conn: database connection.
        db_log: logger exposing ``write_log``.
        node_name: controller name; also the key used in ``pre_stat``.
        node_ip: controller address used for the ONOS SSH commands.
        pre_stat: dict of previous samples; updated in place.

    Returns:
        ``(controller_traffic, pre_stat, reason)``; the status is 'ok',
        'nok', '-' (no previous sample, or too little inbound traffic to
        judge) or 'fail'.
    """
    # Bound before the try so the final return works even when the very
    # first SSH call raises.
    controller_traffic = 'ok'
    reason = []

    # Maps the message type reported by cpman-stats-list to the key used
    # in the per-host report.
    field_by_type = {
        'INBOUND_PACKET': 'inbound',
        'OUTBOUND_PACKET': 'outbound',
        'FLOW_MOD_PACKET': 'mod',
        'FLOW_REMOVED_PACKET': 'removed',
        'REQUEST_PACKET': 'request',
        'REPLY_PACKET': 'reply',
    }

    try:
        summary_rt = SshCommand.onos_ssh_exec(node_ip, 'summary')

        in_packet = 0
        out_packet = 0

        cpman_stat_list = list()

        desc = ''
        ratio = 0

        if summary_rt is not None:
            data_ip = str(summary_rt).split(',')[0].split('=')[1]

            try:
                sql = 'SELECT hostname, of_id FROM ' + DB.OPENSTACK_TBL
                nodes_info = conn.cursor().execute(sql).fetchall()

                for hostname, of_id in nodes_info:
                    cmd = 'cpman-stats-list ' + data_ip + ' control_message ' + of_id
                    stat_rt = SshCommand.onos_ssh_exec(node_ip, cmd)

                    # Every field starts as '-' for each host, so a message
                    # type missing from the output is reported as '-'
                    # instead of re-using the previous host's value.
                    rest_json = {'hostname': str(hostname), 'of_id': str(of_id), 'inbound': '-',
                                 'outbound': '-', 'mod': '-', 'removed': '-', 'request': '-', 'reply': '-'}

                    if stat_rt is not None and not str(stat_rt).startswith('Failed'):
                        for line in stat_rt.splitlines():
                            cols = line.split(',')
                            msg_type = cols[0].split('=')[1]
                            avg_cnt = int(cols[2].split('=')[1])

                            if msg_type == 'INBOUND_PACKET':
                                in_packet = in_packet + avg_cnt
                            elif msg_type == 'OUTBOUND_PACKET':
                                out_packet = out_packet + avg_cnt

                            field = field_by_type.get(msg_type)
                            if field is not None:
                                rest_json[field] = avg_cnt
                    else:
                        reason.append(rest_json)
                        controller_traffic = 'fail'

                    cpman_stat_list.append(rest_json)

                for_save_in = in_packet
                for_save_out = out_packet

                if node_name not in dict(pre_stat):
                    controller_traffic = '-'

                    in_out_dic = dict()
                    in_out_dic['in_packet'] = for_save_in
                    in_out_dic['out_packet'] = for_save_out

                    pre_stat[node_name] = in_out_dic
                else:
                    previous = dict(pre_stat)[node_name]
                    in_packet = in_packet - int(previous['in_packet'])
                    out_packet = out_packet - int(previous['out_packet'])

                    if in_packet <= CONF.alarm()['controller_traffic_minimum_inbound']:
                        # Too little traffic to judge; the baseline is kept
                        # so the increment keeps growing until it can be.
                        desc = 'Minimum increment for status check = ' + str(
                            CONF.alarm()['controller_traffic_minimum_inbound'])
                        controller_traffic = '-'
                    else:
                        if in_packet == 0 and out_packet == 0:
                            ratio = 100
                        elif in_packet <= 0 or out_packet < 0:
                            LOG.info('Controller Traffic Ratio Fail.')
                            ratio = 0
                        else:
                            ratio = float(out_packet) * 100 / in_packet

                        LOG.info('[CPMAN][' + node_name + '] Controller Traffic Ratio = ' + str(ratio) +
                                 '(' + str(out_packet) + '/' + str(in_packet) + ')')
                        desc = 'Controller Traffic Ratio = ' + str(ratio) + '(' + str(out_packet) + '/' + \
                               str(in_packet) + ')\n'

                        if ratio < float(CONF.alarm()['controller_traffic_ratio']):
                            controller_traffic = 'nok'

                        in_out_dic = dict()
                        in_out_dic['in_packet'] = for_save_in
                        in_out_dic['out_packet'] = for_save_out

                        pre_stat[node_name] = in_out_dic
            except Exception:
                LOG.exception()
                controller_traffic = 'fail'
        else:
            controller_traffic = 'fail'

        controller_json = {'status': controller_traffic, 'stat_list': cpman_stat_list,
                           'minimum_inbound_packet': CONF.alarm()['controller_traffic_minimum_inbound'],
                           'current_inbound_packet': in_packet, 'current_outbound_packet': out_packet,
                           'period': CONF.watchdog()['interval'], 'ratio': format(ratio, '.2f'),
                           'description': desc, 'threshold': CONF.alarm()['controller_traffic_ratio']}

        if controller_traffic != 'ok':
            reason.append(controller_json)

        try:
            sql = 'UPDATE ' + DB.ONOS_TBL + \
                  ' SET traffic_stat = \"' + str(controller_json) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE CONTROLLER TRAFFIC INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] CONTROLLER TRAFFIC Update Fail.')
        except Exception:
            LOG.exception()
    except Exception:
        LOG.exception()
        controller_traffic = 'fail'

    return controller_traffic, pre_stat, reason
def _check_resource_grade(conn, db_log, node_name, node_type, item, value, cur_grades):
    """Grade one resource item ('CPU', 'MEMORY' or 'DISK') and raise an event when its grade changed."""
    grade = 'fail'
    reason = []
    conf_key = item.lower()

    if conf_key in CONF.alarm():
        if not alarm_event.is_monitor_item(node_type, item):
            grade = '-'
        else:
            grade = alarm_event.get_grade(conf_key, value)

            if cur_grades[item] != grade:
                reason.append({'value': value})
                alarm_event.occur_event(conn, db_log, node_name, item, cur_grades[item], grade, reason)

            LOG.info('[' + node_name + '][' + item + '][' + grade + ']' + str(reason))

    return grade


def periodic(conn, pre_stat, db_log):
    """Run one monitoring pass over every registered node.

    For each node the ping result is checked first; when the node answers,
    the checks matching its type (ONOS, HA, XOS, OPENSTACK) are executed and
    each result is passed through ``alarm_event.process_event`` together
    with the grade currently stored in the event table. CPU / MEMORY / DISK
    grades are evaluated afterwards and all results are written to the
    node's row in the status table.

    Args:
        conn: database connection.
        pre_stat: dict of counter samples from the previous pass; the
            traffic checks read and update it.
        db_log: logger exposing ``write_log``.

    Returns:
        ``pre_stat``, on every path. The early exits used to return None
        while the normal path returned ``pre_stat``.
    """
    try:
        cur_info = {}
        LOG.info('Periodic checking %s', str(CONF.watchdog()['check_system']))

        try:
            node_list = cmd_proc.get_node_list('all', 'nodename, ip_addr, username, type, sub_type')
            if not node_list:
                LOG.info("Not Exist Node data ...")
                return pre_stat
        except Exception:
            LOG.exception()
            return pre_stat

        # Read cur alarm status
        sql = 'SELECT nodename, item, grade FROM ' + DB.EVENT_TBL
        db_log.write_log(sql)
        cur_grade = conn.cursor().execute(sql).fetchall()

        for nodename, item, grade in cur_grade:
            cur_info.setdefault(nodename, {})[item] = grade

        # check HA, once
        # Defaults keep the HA branch below from hitting unbound names
        # when 'HA' is not part of check_system.
        global_ha_svc = 'fail'
        global_ha_ratio = 'fail'
        global_svc_reason = []
        global_ha_ratio_reason = []
        if 'HA' in CONF.watchdog()['check_system']:
            ha_dic = chk_ha.onos_ha_check(conn, db_log)
            global_ha_svc, global_ha_ratio, global_svc_reason, global_ha_ratio_reason = chk_ha.get_ha_stats(ha_dic)

        # check GW ratio
        gw_total = 0

        # check node traffic
        rx_total = 0
        tx_total = 0
        openstack_rx_dic = dict()
        openstack_tx_dic = dict()
        rx_tx_err_info = dict()
        patch_tx_dic = dict()

        # First pass: collect the rx/tx counters of every OpenStack node so
        # that the per-node checks below can use cluster-wide totals.
        for node_name, node_ip, user_name, node_type, sub_type in node_list:
            if node_type.upper() == 'OPENSTACK':
                openstack_rx_dic[node_name], openstack_tx_dic[node_name], rx_tx_err_info[node_name], \
                    patch_tx_dic[node_name] = chk_openstack.rx_tx_check(user_name, node_ip)

                if openstack_rx_dic[node_name] > 0:
                    rx_total = rx_total + openstack_rx_dic[node_name]

                if openstack_tx_dic[node_name] > 0:
                    tx_total = tx_total + openstack_tx_dic[node_name]

                if sub_type == 'GATEWAY':
                    if openstack_rx_dic[node_name] > 0:
                        gw_total = gw_total + openstack_rx_dic[node_name]

        for node_name, node_ip, user_name, node_type, sub_type in node_list:
            LOG.info('------------------------------------ ' + node_name + ' START ------------------------------------')

            # Every item starts as failed / unknown and is only overwritten
            # by the checks that actually run for this node.
            cpu = '-1'
            memory = '-1'
            disk = '-1'

            onos_app = 'fail'
            onos_rest = 'fail'
            v_router = 'fail'
            xos_status = 'fail'
            synchronizer_status = 'fail'
            swarm_node = 'fail'
            swarm_svc = 'fail'
            ha_svc = 'fail'
            ha_ratio = 'fail'
            openstack_node = 'fail'
            onos_of = 'fail'
            onos_cluster = 'fail'
            traffic_gw = 'fail'
            port_stat_vxlan = 'fail'
            traffic_controller = 'fail'
            traffic_internal = 'fail'

            node_grades = cur_info[node_name]
            kind = node_type.upper()

            # check ping
            network = net_check(node_ip)

            # occur event (rest)
            # 1. ping check
            reason = []
            if network == 'nok':
                reason.append('ping transmit failed')

            network = alarm_event.process_event(conn, db_log, node_name, node_type, 'NETWORK',
                                                node_grades['NETWORK'], network, reason)

            if network == 'ok':
                if kind == 'ONOS':
                    # check node
                    openstack_node, reason = chk_onos.onos_node_check(conn, db_log, node_name, node_ip)
                    openstack_node = alarm_event.process_event(conn, db_log, node_name, node_type, 'OPENSTACK_NODE',
                                                               node_grades['OPENSTACK_NODE'], openstack_node, reason)
                    LOG.info('[' + node_name + '][OPENSTACK_NODE][' + openstack_node + ']' + str(reason))

                    # check app
                    onos_app, reason = chk_onos.onos_app_check(conn, db_log, node_name, node_ip)
                    onos_app = alarm_event.process_event(conn, db_log, node_name, node_type, 'ONOS_APP',
                                                         node_grades['ONOS_APP'], onos_app, reason)
                    LOG.info('[' + node_name + '][ONOS_APP][' + onos_app + ']' + str(reason))

                    # check connection
                    onos_of, onos_cluster, of_reason, cluster_reason = chk_onos.onos_conn_check(conn, db_log, node_name, node_ip)
                    onos_of = alarm_event.process_event(conn, db_log, node_name, node_type, 'ONOS_OPENFLOW',
                                                        node_grades['ONOS_OPENFLOW'], onos_of, of_reason)
                    onos_cluster = alarm_event.process_event(conn, db_log, node_name, node_type, 'ONOS_CLUSTER',
                                                             node_grades['ONOS_CLUSTER'], onos_cluster, cluster_reason)
                    LOG.info('[' + node_name + '][ONOS_OPENFLOW][' + onos_of + ']' + str(of_reason))
                    LOG.info('[' + node_name + '][ONOS_CLUSTER][' + onos_cluster + ']' + str(cluster_reason))

                    # check web
                    onos_rest, reason = chk_onos.onos_rest_check(conn, db_log, node_name, node_ip)
                    onos_rest = alarm_event.process_event(conn, db_log, node_name, node_type, 'ONOS_REST',
                                                          node_grades['ONOS_REST'], onos_rest, reason)
                    LOG.info('[' + node_name + '][ONOS_REST][' + onos_rest + ']' + str(reason))

                    # check controller traffic
                    traffic_controller, pre_stat, reason = chk_onos.controller_traffic_check(conn, db_log, node_name, node_ip, pre_stat)
                    traffic_controller = alarm_event.process_event(conn, db_log, node_name, node_type, 'TRAFFIC_CONTROLLER',
                                                                   node_grades['TRAFFIC_CONTROLLER'], traffic_controller, reason)
                    LOG.info('[' + node_name + '][ONOS_TRAFFIC_CONTROLLER][' + traffic_controller + ']' + str(reason))

                elif kind == 'HA':
                    ha_svc = global_ha_svc
                    ha_svc = alarm_event.process_event(conn, db_log, node_name, node_type, 'HA_SVC',
                                                       node_grades['HA_SVC'], ha_svc, global_svc_reason)
                    LOG.info('[' + node_name + '][HA_SVC][' + ha_svc + ']' + str(global_svc_reason))

                    ha_ratio = global_ha_ratio
                    ha_ratio = alarm_event.process_event(conn, db_log, node_name, node_type, 'HA_RATIO',
                                                         node_grades['HA_RATIO'], ha_ratio, global_ha_ratio_reason)
                    LOG.info('[' + node_name + '][HA_RATIO][' + ha_ratio + ']' + str(global_ha_ratio_reason))

                # check xos (status/synchronizer)
                elif kind == 'XOS':
                    xos_status, reason = chk_xos.xos_status_check(conn, db_log, node_name)
                    xos_status = alarm_event.process_event(conn, db_log, node_name, node_type, 'XOS_SVC',
                                                           node_grades['XOS_SVC'], xos_status, reason)
                    LOG.info('[' + node_name + '][XOS_SVC][' + xos_status + ']' + str(reason))

                    synchronizer_status, reason = chk_xos.xos_sync_check(conn, db_log, node_name)
                    synchronizer_status = alarm_event.process_event(conn, db_log, node_name, node_type, 'SYNCHRONIZER',
                                                                    node_grades['SYNCHRONIZER'], synchronizer_status, reason)
                    LOG.info('[' + node_name + '][SYNCHRONIZER][' + synchronizer_status + ']' + str(reason))

                    # check swarm (app/node)
                    swarm_manager = chk_swarm.find_swarm_manager()
                    swarm_node, reason = chk_swarm.swarm_node_check(conn, db_log, node_name, user_name, node_ip, swarm_manager)
                    swarm_node = alarm_event.process_event(conn, db_log, node_name, node_type, 'SWARM_NODE',
                                                           node_grades['SWARM_NODE'], swarm_node, reason)
                    LOG.info('[' + node_name + '][SWARM_NODE][' + swarm_node + ']' + str(reason))

                    swarm_svc, reason = chk_swarm.swarm_service_check(conn, db_log, node_name, user_name, node_ip, swarm_manager)
                    swarm_svc = alarm_event.process_event(conn, db_log, node_name, node_type, 'SWARM_SVC',
                                                          node_grades['SWARM_SVC'], swarm_svc, reason)
                    LOG.info('[' + node_name + '][SWARM_SVC][' + swarm_svc + ']' + str(reason))

                # check vrouter, gw_ratio
                elif kind == 'OPENSTACK':
                    port_stat_vxlan, pre_stat, reason = chk_openstack.get_node_traffic(conn, db_log, node_name, openstack_rx_dic, openstack_tx_dic, rx_total, tx_total, rx_tx_err_info[node_name], pre_stat)
                    port_stat_vxlan = alarm_event.process_event(conn, db_log, node_name, node_type, 'PORT_STAT_VXLAN',
                                                                node_grades['PORT_STAT_VXLAN'], port_stat_vxlan, reason)
                    LOG.info('[' + node_name + '][PORT_STAT_VXLAN][' + port_stat_vxlan + ']' + str(reason))

                    traffic_internal, pre_stat, reason = chk_openstack.get_internal_traffic(conn, db_log, node_name, node_ip, user_name, sub_type, openstack_rx_dic[node_name], patch_tx_dic[node_name], pre_stat)
                    traffic_internal = alarm_event.process_event(conn, db_log, node_name, node_type, 'TRAFFIC_INTERNAL',
                                                                 node_grades['TRAFFIC_INTERNAL'], traffic_internal, reason)
                    LOG.info('[' + node_name + '][TRAFFIC_INTERNAL][' + traffic_internal + ']' + str(reason))

                    if sub_type.upper() == 'GATEWAY':
                        v_router, reason = chk_openstack.vrouter_check(conn, db_log, node_name, user_name, node_ip)
                        v_router = alarm_event.process_event(conn, db_log, node_name, node_type, 'GATEWAY',
                                                             node_grades['GATEWAY'], v_router, reason)
                        LOG.info('[' + node_name + '][GATEWAY][' + v_router + ']' + str(reason))

                        traffic_gw, pre_stat, reason = chk_openstack.get_gw_ratio_gateway(conn, db_log, node_ip, node_name, openstack_rx_dic[node_name], gw_total, pre_stat)
                    elif sub_type.upper() == 'COMPUTE':
                        v_router = '-'
                        traffic_gw, pre_stat, reason = chk_openstack.get_gw_ratio_compute(conn, db_log, node_ip, node_name, pre_stat)

                    traffic_gw = alarm_event.process_event(conn, db_log, node_name, node_type, 'TRAFFIC_GW',
                                                           node_grades['TRAFFIC_GW'], traffic_gw, reason)
                    LOG.info('[' + node_name + '][TRAFFIC_GW][' + traffic_gw + ']' + str(reason))

                # check resource
                cpu, memory, disk = chk_resource.check_resource(conn, db_log, node_name, user_name, node_ip)

            # 3. resource check (CPU/MEM/DISK)
            cpu_grade = _check_resource_grade(conn, db_log, node_name, node_type, 'CPU', cpu, node_grades)
            mem_grade = _check_resource_grade(conn, db_log, node_name, node_type, 'MEMORY', memory, node_grades)
            disk_grade = _check_resource_grade(conn, db_log, node_name, node_type, 'DISK', disk, node_grades)

            try:
                columns = [
                    ('CPU', cpu_grade),
                    ('MEMORY', mem_grade),
                    ('DISK', disk_grade),
                    ('NETWORK', network),
                    ('ONOS_APP', onos_app),
                    ('ONOS_REST', onos_rest),
                    ('ONOS_OPENFLOW', onos_of),
                    ('ONOS_CLUSTER', onos_cluster),
                    ('XOS_SVC', xos_status),
                    ('SYNCHRONIZER', synchronizer_status),
                    ('SWARM_NODE', swarm_node),
                    ('OPENSTACK_NODE', openstack_node),
                    ('SWARM_SVC', swarm_svc),
                    ('GATEWAY', v_router),
                    ('HA_SVC', ha_svc),
                    ('HA_RATIO', ha_ratio),
                    ('TRAFFIC_GW', traffic_gw),
                    ('PORT_STAT_VXLAN', port_stat_vxlan),
                    ('TRAFFIC_CONTROLLER', traffic_controller),
                    ('TRAFFIC_INTERNAL', traffic_internal),
                    ('time', str(datetime.now())),
                ]
                # Produces "UPDATE <tbl> SET A = 'x', B = 'y', ... WHERE ...".
                assignments = ','.join([' ' + column + ' = \'' + value + '\'' for column, value in columns])
                sql = 'UPDATE ' + DB.STATUS_TBL + ' SET' + assignments + \
                      ' WHERE nodename = \'' + node_name + '\''
                db_log.write_log('----- UPDATE TOTAL SYSTEM INFO -----\n' + sql)

                if DB.sql_execute(sql, conn) != 'SUCCESS':
                    db_log.write_log('[FAIL] TOTAL SYSTEM INFO DB Update Fail.')
            except Exception:
                LOG.exception()
    except Exception:
        LOG.exception()

    return pre_stat
def periodic(conn):
    """Run one monitoring pass: ping, app and resource checks for every node.

    Reads the node list and the grades currently stored in the event table,
    then for each node checks ping, the application matching the node's IP
    (ONOS / XOS / swarm / OpenStack lists from CONF) and CPU / memory / disk
    usage. The resource table is updated, an event is raised for every item
    whose grade differs from the stored one, and the status table is
    updated. Returns None.
    """
    cur_info = {}
    LOG.info("Periodic checking...%s", str(CONF.watchdog()['check_system']))

    try:
        node_list = cmd_proc.get_node_list('all', 'nodename, ip_addr, username')
        if not node_list:
            LOG.info("Not Exist Node data ...")
            return
    except:
        LOG.exception()
        return

    # Read cur alarm status
    sql = 'SELECT nodename, item, grade FROM ' + DB.EVENT_TBL
    LOG.info(sql)
    cur_grade = conn.cursor().execute(sql).fetchall()

    # cur_info[nodename][item] -> grade currently recorded for that item.
    # NOTE(review): dict.has_key exists only in Python 2.
    for nodename, item, grade in cur_grade:
        if not cur_info.has_key(nodename):
            cur_info[nodename] = {}
        cur_info[nodename][item] = grade

    for node_name, node_ip, user_name in node_list:
        ping = net_check(node_ip)
        # Defaults reported when the node does not answer the ping.
        app = 'fail'
        cpu = '-1'
        mem = '-1'
        disk = '-1'

        if ping == 'ok':
            # The app check is picked by a substring match of the node IP
            # against the stringified list of each component.
            if node_ip in str(CONF.onos()['list']):
                app = onos_app_check(node_ip)
            elif node_ip in str(CONF.xos()['list']):
                app = xos_app_check(node_ip)
            elif node_ip in str(CONF.swarm()['list']):
                app = swarm_app_check(node_ip)
            elif node_ip in str(CONF.openstack()['list']):
                app = openstack_app_check(node_ip)

            cpu = str(resource.get_cpu_usage(user_name, node_ip, True))
            mem = str(resource.get_mem_usage(user_name, node_ip, True))
            disk = str(resource.get_disk_usage(user_name, node_ip, True))

        # NOTE(review): the source layout was lost; this update is assumed
        # to run for every node (with '-1' values when the ping failed) -
        # confirm against the original file.
        try:
            sql = 'UPDATE ' + DB.RESOURCE_TBL + \
                  ' SET cpu = \'' + cpu + '\',' + \
                  ' memory = \'' + mem + '\',' + \
                  ' disk = \'' + disk + '\'' \
                  ' WHERE nodename = \'' + node_name + '\''
            LOG.info('Update Resource info = ' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                LOG.error('DB Update Fail.')
        except:
            LOG.exception()

        # occur event (rest)
        # 1. ping check
        if cur_info[node_name]['ping'] != ping:
            occur_event(conn, node_name, 'ping', cur_info[node_name]['ping'], ping)

        # 2. app check
        if cur_info[node_name]['app'] != app:
            occur_event(conn, node_name, 'app', cur_info[node_name]['app'], app)

        # 3. resource check (CPU/MEM/DISK)
        # Each grade stays 'fail' when no thresholds are configured for it.
        # NOTE(review): the grade comparisons are assumed to be nested under
        # the has_key checks - confirm against the original file.
        cpu_grade = 'fail'
        if CONF.alarm().has_key('cpu'):
            cpu_grade = get_grade('cpu', cpu)
            if cur_info[node_name]['cpu'] != cpu_grade:
                occur_event(conn, node_name, 'cpu', cur_info[node_name]['cpu'], cpu_grade)

        mem_grade = 'fail'
        if CONF.alarm().has_key('memory'):
            mem_grade = get_grade('memory', mem)
            if cur_info[node_name]['memory'] != mem_grade:
                occur_event(conn, node_name, 'memory', cur_info[node_name]['memory'], mem_grade)

        disk_grade = 'fail'
        if CONF.alarm().has_key('disk'):
            disk_grade = get_grade('disk', disk)
            if cur_info[node_name]['disk'] != disk_grade:
                occur_event(conn, node_name, 'disk', cur_info[node_name]['disk'], disk_grade)

        try:
            sql = 'UPDATE ' + DB.STATUS_TBL + \
                  ' SET cpu = \'' + cpu_grade + '\',' + \
                  ' memory = \'' + mem_grade + '\',' + \
                  ' disk = \'' + disk_grade + '\',' + \
                  ' ping = \'' + ping + '\',' + \
                  ' app = \'' + app + '\',' + \
                  ' time = \'' + str(datetime.now()) + '\'' + \
                  ' WHERE nodename = \'' + node_name + '\''
            LOG.info('Update Status info = ' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                LOG.error('DB Update Fail.')
        except:
            LOG.exception()