def authentication(self):
    try:
        if not self.headers.getheader("authorization"):
            self.wfile.write('No Authorization Header\n')
            return False
        else:
            request_auth = self.headers.getheader("authorization")
            id_pw_list = CONF.rest()['user_password']

            try:
                request_account = base64.b64decode(str(request_auth).split()[-1])

                for id_pw in id_pw_list:
                    if id_pw.strip() == request_account:
                        LOG.info('[REST-SERVER] AUTH SUCCESS = %s, from %s',
                                 id_pw, self.client_address)
                        return True
            except:
                LOG.exception()

            self.wfile.write('Request Authentication User ID or Password is Wrong \n')
            LOG.info('[REST-SERVER] AUTH FAIL = %s, from %s',
                     base64.b64decode(str(request_auth).split()[-1]),
                     self.client_address)
            return False
    except:
        LOG.exception()
        return False
def stop(self):
    try:
        pid = self.get_pid()
    except IOError:
        pid = None

    if not pid:
        message = "pidfile %s does not exist. Daemon not running?\n"
        sys.stderr.write(message % self.pidfile)
        return  # not an error in a restart

    # Try killing the daemon process
    try:
        LOG.info("--- Daemon STOP ---")
        while 1:
            for cpid in self.get_child_pid(pid):
                os.kill(cpid, SIGTERM)
            os.kill(pid, SIGTERM)
            time.sleep(0.1)
    except OSError, err:
        err = str(err)
        if err.find("No such process") > 0:
            if os.path.exists(self.pidfile):
                os.remove(self.pidfile)
        else:
            print str(err)
            print "Stopping Fail ..."
            sys.exit(1)
def get_service_list():
    service_list = []
    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/instances/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return ''

        instance_array = json.loads(output)

        for instance_info in instance_array:
            name = instance_info['instance_name']
            LOG.info('swarm_instance_name = ' + name)
            service_list.append(name)
    except:
        LOG.exception()

    return service_list
def get_disk_usage(username, node_ip, only_value=False):
    try:
        cmd = 'sudo df -h / | grep -v Filesystem'
        cmd_rt = SshCommand.ssh_exec(username, node_ip, cmd)

        ratio = float()
        if cmd_rt is None:
            LOG.info("%s Disk check Fail", node_ip)
            if only_value:
                return -1
            return {'DISK': 'Command fail'}
        else:
            if '/' in cmd_rt:
                try:
                    ratio = float(cmd_rt.split()[-2].replace('%', ''))
                except:
                    LOG.exception()

            result = {
                'DISK': {
                    'RATIO': float(format(ratio, '.2f')),
                    'Description': cmd_rt
                }
            }
            LOG.info(" > DISK : %s", str(format(ratio, '.2f')))

            if only_value:
                return float(format(ratio, '.2f'))
            return result
    except:
        LOG.exception()
        return -1
def xos_status_check(conn, db_log, node_name):
    xos_status = 'ok'
    xos_list = []
    fail_reason = []
    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/xoses/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return 'fail', None

        xos_array = json.loads(output)

        for xos_info in xos_array:
            backend_status = xos_info['backend_status']
            LOG.info('xos_status_backend_status = ' + backend_status)

            tmp = str(backend_status).split('-')
            if tmp[0].strip() == '0':
                status = 'ok'
            else:
                status = 'nok'

            xos_json = {
                'name': xos_info['name'],
                'status': status,
                'description': tmp[1].strip()
            }
            xos_list.append(xos_json)

            if status == 'nok':
                xos_status = 'nok'
                fail_reason.append(xos_json)

        try:
            sql = 'UPDATE ' + DB.XOS_TBL + \
                  ' SET xos_status = \"' + str(xos_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE XOS STATUS INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] XOS STATUS DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        xos_status = 'fail'

    return xos_status, fail_reason
def auth_pw(self, cli_pw):
    id_pw_list = CONF.rest()['user_password']
    cli_pw = base64.b64decode(cli_pw)

    for id_pw in id_pw_list:
        if id_pw.strip() == cli_pw:
            LOG.info('[REST-SERVER] AUTH SUCCESS = ' + id_pw)
            return True

    LOG.info('[REST-SERVER] AUTH FAIL = ' + cli_pw)
    return False
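# Illustration only (not in the original source): auth_pw() compares a base64-encoded
# credential from the client against the plaintext "id:password" entries kept in
# CONF.rest()['user_password']. The 'admin:sona' account below is a made-up example.
#
#   import base64
#   id_pw_list = ['admin:sona']                  # CONF.rest()['user_password'] entries
#   cli_pw = base64.b64encode('admin:sona')      # 'YWRtaW46c29uYQ==', as sent by a client
#   base64.b64decode(cli_pw) == id_pw_list[0]    # True -> AUTH SUCCESS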
def proc_dis_resource(node, param):
    res_result = dict()

    nodes_info = get_node_list(node, 'nodename, ' + param, DB.RESOURCE_TBL)
    LOG.info('*****' + str(nodes_info))

    for nodename, value in nodes_info:
        if value < 0:
            res_result[nodename] = 'FAIL'
        else:
            res_result[nodename] = value

    return res_result
def occur_event(conn, node_name, item, pre_value, cur_value):
    time = str(datetime.now())
    desc = pre_value + ' -> ' + cur_value

    sql = 'UPDATE ' + DB.EVENT_TBL + \
          ' SET grade = \'' + cur_value + '\'' + ',' + \
          ' desc = \'' + desc + '\'' + ',' + \
          ' time = \'' + time + '\'' + \
          ' WHERE nodename = \'' + node_name + '\' and item = \'' + item + '\''
    LOG.info('Update alarm info = ' + sql)

    if DB.sql_execute(sql, conn) != 'SUCCESS':
        LOG.error('DB Update Fail.')

    push_event(node_name, item, cur_value, desc, time)
def get_resource_usage(node_list, param):
    res_result = dict()

    for node_name, node_ip, username, ping in node_list:
        res_result[node_name] = {}

        if ping.lower() == 'ok':
            LOG.info("GET %s usage for %s", param, node_name)
            res_result[node_name] = PARAM_MAP[param](username, node_ip)
        else:
            LOG.info("Can not get %s Usage... %s Network is NOK", param, node_name)
            res_result[node_name] = 'Net fail'

    return res_result
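# PARAM_MAP is defined elsewhere in this module. A minimal sketch of what it is
# assumed to contain, based on the collectors below (get_cpu_usage, get_mem_usage,
# get_disk_usage), which all share the (username, node_ip) call signature; the exact
# key spelling is an assumption, only the "param -> collector" dispatch is taken from
# the code above.
#
#   PARAM_MAP = {
#       'cpu': get_cpu_usage,
#       'memory': get_mem_usage,
#       'disk': get_disk_usage,
#   }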
def send_response_traffic_test_old(cond, auth):
    trace_result_data = {}
    try:
        is_success, result = trace.traffic_test_old(cond)

        if is_success:
            trace_result_data['result'] = 'SUCCESS'
        else:
            trace_result_data['result'] = 'FAIL'
            # trace_result_data['fail_reason'] = 'The source ip does not exist.'

        if result is not None:
            trace_result_data['traffic_test_result'] = result

        trace_result_data['transaction_id'] = cond['transaction_id']

        try:
            LOG.info('%s', json.dumps(trace_result_data, sort_keys=True, indent=4))
        except:
            pass

        req_body_json = json.dumps(trace_result_data)

        try:
            url = str(cond['app_rest_url'])
            # requests.post(str(url), headers=header, data=req_body_json, timeout=2)

            if str(auth).startswith('Basic '):
                auth = str(auth).split(' ')[1]

            cmd = 'curl -X POST -u \'' + CONF.onos()['rest_auth'] + \
                  '\' -H \'Content-Type: application/json\' -d \'' + \
                  str(req_body_json) + '\' ' + url
            LOG.error('%s', 'curl = ' + cmd)

            result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            result.communicate()

            if result.returncode != 0:
                # Push noti does not respond
                pass
        except:
            LOG.exception()
            pass
    except:
        LOG.exception()
def exit(self):
    try:
        pf = file(PIDFILE, 'r')
        pid = int(pf.read().strip())
        pf.close()

        LOG.info("--- Daemon STOP [fail to check rest server] ---")
        try:
            LOG.info('PID = ' + str(pid))
            os.killpg(pid, SIGTERM)
        except OSError, err:
            err = str(err)
            if err.find("No such process") > 0:
                if os.path.exists(self.pidfile):
                    os.remove(self.pidfile)
    except:
        LOG.exception()
def get_cpu_usage(username, node_ip, only_value=False):
    cmd = 'grep \'cpu\ \' /proc/stat'
    cmd_rt = SshCommand.ssh_exec(username, node_ip, cmd)

    ratio = float()
    if cmd_rt is None:
        LOG.info("%s CPU check Fail", node_ip)
        if only_value:
            return -1
        return {'CPU': 'Command fail'}
    else:
        if 'cpu ' in cmd_rt:
            LOG.info("cmd_rt: %s", cmd_rt)
            try:
                f = cmd_rt.split()
                ratio = (float(f[1]) + float(f[3])) * 100 / \
                        (float(f[1]) + float(f[3]) + float(f[4]))
            except:
                LOG.exception()

        result = {
            'CPU': {
                'RATIO': float(format(ratio, '.2f')),
                'Description': cmd_rt
            }
        }
        LOG.info(" CPU check ... %s", result)

        if only_value:
            return float(format(ratio, '.2f'))
        return result
def get_content(self):
    if not self.headers.getheader('content-length'):
        self.do_HEAD(400)
        self.wfile.write(
            str({
                "result": "FAIL",
                "fail_reason": "Bad Request, Content Length is 0\n"
            }))
        LOG.info('[Data Check] Received No Data from %s', self.client_address)
        return False
    else:
        try:
            receive_data = json.loads(
                self.rfile.read(int(self.headers.getheader("content-length"))))
            LOG.info('%s', '[Received Data] \n'
                     + json.dumps(receive_data, sort_keys=True, indent=4))
            return receive_data
        except:
            LOG.exception()
            error_reason = 'Json Data Parsing Error\n'
            self.do_HEAD(400)
            self.wfile.write(
                str({
                    "result": "FAIL",
                    "fail_reason": error_reason
                }))
            LOG.info('[Check Content] %s', error_reason)
            return False
def get_mem_usage(username, node_ip, only_value=False):
    cmd = 'free -t -m | grep Mem'
    cmd_rt = SshCommand.ssh_exec(username, node_ip, cmd)

    ratio = float()
    if cmd_rt is None:
        LOG.info("%s Memory check Fail", node_ip)
        if only_value:
            return -1
        return {'MEMORY': 'Command fail'}
    else:
        if 'Mem' in cmd_rt:
            LOG.info("cmd_rt %s", cmd_rt)
            try:
                f = cmd_rt.split()
                ratio = float(f[2]) * 100 / float(f[1])
            except:
                LOG.exception()

        result = {
            'MEMORY': {
                'RATIO': float(format(ratio, '.2f')),
                'Description': cmd_rt
            }
        }
        LOG.info(" Memory check ... %s", result)

        if only_value:
            return float(format(ratio, '.2f'))
        return result
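# Usage sketch (illustration only; hypothetical account and address): the three
# collectors above share one contract -- a bare float when only_value=True, a
# {'CPU'|'MEMORY'|'DISK': {'RATIO': ..., 'Description': ...}} dict otherwise, and
# -1 / a 'Command fail' entry when the SSH command returns nothing.
#
#   cpu = get_cpu_usage('ubuntu', '10.0.0.5', only_value=True)   # e.g. 12.34, or -1
#   mem = get_mem_usage('ubuntu', '10.0.0.5')                    # {'MEMORY': {...}}
#   if cpu < 0:
#       LOG.info('10.0.0.5 SSH check fail')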
def find_swarm_manager():
    hostname = ''
    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/controllers/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return ''

        controller_array = json.loads(output)

        for controller_info in controller_array:
            auth_url = controller_info['auth_url']
            '''
            backend_status = controller_info['backend_status']
            LOG.info('xos_sync_backend_status = ' + backend_status)

            tmp = str(backend_status).split('-')
            if tmp[0].strip() == '0':
            '''
            LOG.info('swarm_manager_auth_url = ' + auth_url)

            tmp = str(auth_url).split(':')
            hostname = tmp[0]
            break
    except:
        LOG.exception()

    return hostname
def run(self):
    # DB initiation
    DB.db_initiation()

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print 'Rest Server failed to start'
        LOG.exception()
        sys.exit(1)

    # Periodic monitoring
    if CONF.watchdog()['interval'] == 0:
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)
    else:
        LOG.info("--- Periodic Monitoring Start ---")
        conn = DB.connection()
        while True:
            try:
                watchdog.periodic(conn)
                time.sleep(CONF.watchdog()['interval'])
            except:
                watchdog.push_event('sonawatcher', 'disconnect', 'critical',
                                    'sonawatcher server shutdown',
                                    str(datetime.now()))
                conn.close()
                LOG.exception()
                sys.exit(1)
def tperf_test_run(perf_conditions):
    tperf_result = dict()
    request_headers = {
        'Authorization': CONF.onos()['rest_auth'],
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }

    try:
        # 1. create instance
        LOG.info("[T-perf server/client VM create] --- ")
        server_vm, client_vm, client_floatingip = traffic_test.create_instance(
            perf_conditions['server'], perf_conditions['client'])

        # 2. run performance test
        if server_vm and client_vm:
            tperf_result = traffic_test.tperf_command_exec(
                server_vm.__dict__['addresses'].values()[0][0]['addr'],
                client_floatingip.ip,
                perf_conditions['test_options'])
        else:
            tperf_result.update({
                'result': 'FAIL',
                'fail_reason': 'Fail to create instance.'
            })

        tperf_result.update({'transaction_id': perf_conditions['transaction_id']})
        LOG.info("[Traffic Performance Test] Return Result = %s",
                 json.dumps(tperf_result))

        # send tperf test result to ONOS
        response = requests.post(perf_conditions['app_rest_url'],
                                 data=str(json.dumps(tperf_result)),
                                 headers=request_headers)
        LOG.info("[Tperf Result Send] Response = %s %s",
                 response.status_code, response.reason)

        # delete tperf test instance
        traffic_test.delete_test_instance(server_vm, client_vm, client_floatingip)
    except:
        LOG.exception()
def do_GET(self):
    request_sz = int(self.headers["Content-length"])
    request_str = self.rfile.read(request_sz)
    request_obj = json.loads(request_str)

    LOG.info('[REST-SERVER] CLIENT INFO = ' + str(self.client_address))
    LOG.info('[REST-SERVER] RECV BODY = \n'
             + json.dumps(request_obj, sort_keys=True, indent=4))

    if self.headers.getheader('Authorization') is None:
        self.do_HEAD(401)
        self.wfile.write('no auth header received')
        LOG.info('[REST-SERVER] no auth header received')
    elif self.auth_pw(self.headers.getheader('Authorization')):
        if self.path.startswith('/command'):
            if command.exist_command(request_obj):
                res_body = command.parse_command(request_obj)
                self.do_HEAD(200)
                self.wfile.write(json.dumps(res_body))
                LOG.info('[REST-SERVER] RES BODY = \n%s',
                         json.dumps(res_body, sort_keys=True, indent=4))
            else:
                self.do_HEAD(404)
                self.wfile.write('command not found')
                LOG.info('[REST-SERVER] ' + 'command not found')
        elif self.path.startswith('/regi'):
            try:
                self.do_HEAD(200)
                url = 'http://' + self.client_address[0] + ':' \
                      + request_obj['port'] + '/' + str(request_obj['uri'])
                res_body = command.regi_url(url, self.headers.getheader('Authorization'))
                self.wfile.write(json.dumps(res_body))
                LOG.info('[REST-SERVER] RES BODY = \n%s',
                         json.dumps(res_body, sort_keys=True, indent=4))
            except:
                LOG.exception()
        elif self.path.startswith('/unregi'):
            try:
                self.do_HEAD(200)
                url = 'http://' + self.client_address[0] + ':' \
                      + request_obj['port'] + '/' + str(request_obj['uri'])
                res_body = command.unregi_url(url)
                self.wfile.write(json.dumps(res_body))
                LOG.info('[REST-SERVER] RES BODY = \n%s',
                         json.dumps(res_body, sort_keys=True, indent=4))
            except:
                LOG.exception()
        else:
            self.do_HEAD(404)
            self.wfile.write(self.path + ' not found')
            LOG.info('[REST-SERVER] ' + self.path + ' not found')
    else:
        self.do_HEAD(401)
        self.wfile.write(self.headers.getheader('Authorization'))
        self.wfile.write('not authenticated')
        LOG.info('[REST-SERVER] not authenticated')
def periodic(conn, pre_stat, db_log):
    try:
        cur_info = {}
        # LOG.info('Periodic checking %s', str(CONF.watchdog()['check_system']))

        try:
            node_list = cmd_proc.get_node_list('all', 'nodename, ip_addr, username, type, sub_type')
            if not node_list:
                LOG.info("Not Exist Node data ...")
                return pre_stat
        except:
            LOG.exception()
            return pre_stat

        # Read current alarm status
        sql = 'SELECT nodename, item, grade FROM ' + DB.EVENT_TBL
        db_log.write_log(sql)
        cur_grade = conn.cursor().execute(sql).fetchall()

        old_nok_count = 0
        for nodename, item, grade in cur_grade:
            if not cur_info.has_key(nodename):
                cur_info[nodename] = {}
            cur_info[nodename][item] = grade
            if grade != 'ok':
                old_nok_count += 1

        new_nok_count = 0
        for node_name, node_ip, user_name, type, sub_type in node_list:
            # LOG.info('------------------------------------ ' + node_name + ' START ------------------------------------')
            onos_cluster = 'fail'
            onos_device = 'fail'
            onos_link = 'fail'
            onos_app = 'fail'

            # ping check
            ping = net_check(node_ip)
            ping_reason = []
            if ping != 'ok':
                ping_reason.append('ping check failed on ' + node_ip)
                new_nok_count += 1
            ping = alarm_event.process_event(conn, db_log, node_name, type, 'PING',
                                             cur_info[node_name]['PING'], ping, ping_reason)

            if ping == 'ok':
                if type.upper() == 'ONOS':
                    # check connection
                    onos_cluster, onos_device, onos_link, onos_app, \
                        cluster_reason, device_reason, link_reason, app_reason = \
                        chk_onos.onos_check(conn, db_log, node_name, node_ip)

                    onos_cluster = alarm_event.process_event(conn, db_log, node_name, type, 'ONOS_CLUSTER',
                                                             cur_info[node_name]['ONOS_CLUSTER'], onos_cluster, cluster_reason)
                    onos_device = alarm_event.process_event(conn, db_log, node_name, type, 'ONOS_DEVICE',
                                                            cur_info[node_name]['ONOS_DEVICE'], onos_device, device_reason)
                    onos_link = alarm_event.process_event(conn, db_log, node_name, type, 'ONOS_LINK',
                                                          cur_info[node_name]['ONOS_LINK'], onos_link, link_reason)
                    onos_app = alarm_event.process_event(conn, db_log, node_name, type, 'ONOS_APP',
                                                         cur_info[node_name]['ONOS_APP'], onos_app, app_reason)

                    if onos_cluster != 'ok':
                        new_nok_count += 1
                    if onos_device != 'ok':
                        new_nok_count += 1
                    if onos_link != 'ok':
                        new_nok_count += 1
                    if onos_app != 'ok':
                        new_nok_count += 1

            try:
                sql = 'UPDATE ' + DB.STATUS_TBL + \
                      ' SET' + \
                      ' PING = \'' + ping + '\',' + \
                      ' ONOS_CLUSTER = \'' + onos_cluster + '\',' + \
                      ' ONOS_DEVICE = \'' + onos_device + '\',' + \
                      ' ONOS_LINK = \'' + onos_link + '\',' + \
                      ' ONOS_APP = \'' + onos_app + '\',' + \
                      ' time = \'' + str(datetime.now()) + '\'' + \
                      ' WHERE nodename = \'' + node_name + '\''
                db_log.write_log('----- UPDATE TOTAL SYSTEM INFO -----\n' + sql)

                if DB.sql_execute(sql, conn) != 'SUCCESS':
                    db_log.write_log('[FAIL] TOTAL SYSTEM INFO DB Update Fail.')
            except:
                LOG.exception()

            # do not log per-node status when everything is ok
            if old_nok_count > 0:
                LOG.info('chk_onos[%s]: ping=%s cluster=%s device=%s link=%s app=%s'
                         % (node_name, ping, onos_cluster, onos_device, onos_link, onos_app))

        if old_nok_count > 0 and new_nok_count == 0:
            alarm_event.process_event(conn, db_log, 'ALL', 'SITE', 'STATUS', 'none', 'ok', [])

        # send all pending alarm messages
        alarm_event.flush_event_alarm()
    except:
        LOG.exception()

    return pre_stat
def do_GET(self):
    # health check
    if self.path.startswith('/alive-check'):
        self.do_HEAD(200)
        self.wfile.write('ok\n')
        return

    if not self.authentication():
        self.do_HEAD(401)
        return
    else:
        if not self.headers.getheader('Content-Length'):
            self.do_HEAD(400)
            self.wfile.write('Bad Request, Content Length is 0\n')
            return
        else:
            request_size = int(self.headers.getheader("Content-Length"))
            request_string = self.rfile.read(request_size)
            request_obj = json.loads(request_string)

            LOG.info('[REST-SERVER] CLIENT INFO = ' + str(self.client_address))
            LOG.info('[REST-SERVER] RECV BODY = \n'
                     + json.dumps(request_obj, sort_keys=True, indent=4))

            if self.path.startswith('/command'):
                try:
                    if command.exist_command(request_obj):
                        res_body = command.parse_command(request_obj)
                        self.do_HEAD(200)
                        self.wfile.write(json.dumps(res_body))
                        LOG.info('[REST-SERVER] RES BODY = \n%s',
                                 json.dumps(res_body, sort_keys=True, indent=4))
                    else:
                        self.do_HEAD(404)
                        self.wfile.write('command not found')
                        LOG.info('[REST-SERVER] ' + 'command not found')
                except:
                    LOG.exception()
            elif self.path.startswith('/regi'):
                try:
                    self.do_HEAD(200)
                    url = str(request_obj['url'])
                    res_body = command.regi_url(url, self.headers.getheader('Authorization'))
                    self.wfile.write(json.dumps(res_body))
                    LOG.info('[REST-SERVER] RES BODY = \n%s',
                             json.dumps(res_body, sort_keys=True, indent=4))
                except:
                    LOG.exception()
            elif self.path.startswith('/event_list'):
                try:
                    self.do_HEAD(200)
                    url = str(request_obj['url'])
                    res_body = command.get_event_list(url, self.headers.getheader('Authorization'))
                    self.wfile.write(json.dumps(res_body))
                    LOG.info('[REST-SERVER] RES BODY = \n%s',
                             json.dumps(res_body, sort_keys=True, indent=4))
                except:
                    LOG.exception()
            elif self.path.startswith('/unregi'):
                try:
                    self.do_HEAD(200)
                    url = str(request_obj['url'])
                    res_body = command.unregi_url(url)
                    self.wfile.write(json.dumps(res_body))
                    LOG.info('[REST-SERVER] RES BODY = \n%s',
                             json.dumps(res_body, sort_keys=True, indent=4))
                except:
                    LOG.exception()
            else:
                self.do_HEAD(404)
                self.wfile.write(self.path + ' not found\n')
                LOG.info('[REST-SERVER] ' + self.path + ' not found')
def do_HEAD(self, res_code):
    self.send_response(res_code)
    self.send_header('Content-type', 'application/json')
    self.end_headers()

    if res_code != 200:
        LOG.info('[REST-SERVER] RESPONSE CODE = ' + str(res_code))
def get_internal_traffic(conn, db_log, node_name, node_ip, user_name, sub_type, rx_count, patch_tx, pre_stat):
    try:
        status = 'ok'
        in_packet = 0
        out_packet = 0
        reason_list = []
        desc = ''

        if sub_type == 'COMPUTE':
            flow_rt = SshCommand.ssh_exec(user_name, node_ip,
                                          'sudo ovs-ofctl -O OpenFlow13 dump-flows br-int')
            inport_cnt = 0
            gw_cnt = 0
            output_cnt = 0

            if flow_rt is not None:
                for line in flow_rt.splitlines():
                    tmp = line.split(',')
                    if 'in_port' in line:
                        inport_cnt = inport_cnt + int(tmp[3].split('=')[1])
                    elif 'output' in line:
                        output_cnt = output_cnt + int(tmp[3].split('=')[1])
                    elif 'actions=group' in line:
                        gw_cnt = gw_cnt + int(tmp[3].split('=')[1])

                in_packet = inport_cnt + rx_count
                out_packet = gw_cnt + output_cnt
                port_json = {'vm_tx': inport_cnt, 'vxlan_rx': rx_count,
                             'out_gw': gw_cnt, 'output': output_cnt}
            else:
                port_json = {'vm_tx': -1, 'vxlan_rx': -1, 'out_gw': -1, 'output': -1}
                status = 'fail'
        else:
            port_json = {'vxlan_rx': rx_count, 'patch-integ': patch_tx}

            if patch_tx == -1:
                status = 'fail'
            else:
                in_packet = rx_count
                out_packet = patch_tx

        for_save_in = in_packet
        for_save_out = out_packet

        if not dict(pre_stat).has_key(node_name + '_internal'):
            status = '-'
            vxlan_json = {'port_stat_in_out': port_json,
                          'period': CONF.watchdog()['interval'],
                          'ratio': 0,
                          'current_rx': -1,
                          'current_tx': -1,
                          'description': desc,
                          'threshold': CONF.alarm()['internal_traffic_ratio'],
                          'status': status}
        elif status == 'ok':
            in_packet = in_packet - int(dict(pre_stat)[node_name + '_internal']['in_packet'])
            out_packet = out_packet - int(dict(pre_stat)[node_name + '_internal']['out_packet'])

            if in_packet == 0 and out_packet == 0:
                ratio = 100
            elif in_packet <= 0 or out_packet < 0:
                LOG.info('Internal Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(out_packet) * 100 / in_packet

            LOG.info('Internal Traffic Ratio = ' + str(ratio))
            desc = 'Internal Traffic Ratio = ' + str(ratio) + '(' + str(out_packet) + '/' + str(in_packet) + ')'

            if ratio < float(CONF.alarm()['internal_traffic_ratio']):
                status = 'nok'

            vxlan_json = {'port_stat_in_out': port_json,
                          'period': CONF.watchdog()['interval'],
                          'ratio': format(ratio, '.2f'),
                          'current_rx': in_packet,
                          'current_tx': out_packet,
                          'description': desc,
                          'threshold': CONF.alarm()['internal_traffic_ratio'],
                          'status': status}

        in_out_dic = dict()
        in_out_dic['in_packet'] = for_save_in
        in_out_dic['out_packet'] = for_save_out
        pre_stat[node_name + '_internal'] = in_out_dic

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET internal_traffic = \"' + str(vxlan_json) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE INTERNAL TRAFFIC INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] INTERNAL TRAFFIC DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list
def xos_sync_check(conn, db_log, node_name):
    swarm_sync = 'ok'
    sync_list = []
    fail_reason = []
    try:
        url = CONF.xos()['xos_rest_server']
        account = CONF.xos()['xos_rest_account']

        cmd = 'curl -H "Accept: application/json; indent=4" -u ' + account + \
              ' -X GET ' + url + '/api/core/diags/'

        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        output, error = result.communicate()

        if result.returncode != 0:
            LOG.error("Cmd Fail, cause => %s", error)
            return 'fail', None

        sync_array = json.loads(output)

        for xos_info in sync_array:
            backend_status = xos_info['backend_status']
            LOG.info('xos_sync_backend_status = ' + backend_status)

            tmp = str(backend_status).split('-')
            if tmp[0].strip() in ['0', '1']:
                status = 'ok'
            else:
                status = 'nok'

            # check time since the synchronizer's last run
            last_time = json.loads(xos_info['backend_register'])['last_run']
            cur_time = time.time()
            interval = int(cur_time - last_time)

            if interval >= 30:
                status = 'nok'

            xos_json = {
                'name': xos_info['name'],
                'status': status,
                'description': tmp[1].strip(),
                'last_run_interval': interval
            }
            sync_list.append(xos_json)

            if status == 'nok':
                swarm_sync = 'nok'
                fail_reason.append(xos_json)

        try:
            sql = 'UPDATE ' + DB.XOS_TBL + \
                  ' SET synchronizer = \"' + str(sync_list) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE SYNCHRONIZER INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] SYNCHRONIZER DB Update Fail.')
        except:
            LOG.exception()
    except:
        LOG.exception()
        swarm_sync = 'fail'

    return swarm_sync, fail_reason
def run(self):
    db_log = USER_LOG()
    db_log.set_log('db.log', CONF.base()['log_rotate_time'],
                   CONF.base()['log_backup_count'])

    pre_stat = dict()

    # DB initiation
    DB.db_initiation(db_log)

    # Start RESTful server
    try:
        REST_SVR.rest_server_start()
    except:
        print 'Rest Server failed to start'
        LOG.exception()
        self.exit()

    # Periodic monitoring
    if CONF.watchdog()['interval'] == 0:
        LOG.info("--- Not running periodic monitoring ---")
        while True:
            time.sleep(3600)
    else:
        LOG.info("--- Periodic Monitoring Start ---")
        history_log.write_log("--- Event History Start ---")
        conn = DB.connection()
        exitFlag = False

        while True:
            try:
                i = 0
                while i < 3:
                    i = i + 1

                    # check rest server
                    try:
                        url = 'http://' + socket.gethostbyname(socket.gethostname()) + ':' \
                              + str(CONF.rest()['rest_server_port']) + '/alive-check'
                        cmd = 'curl -X GET \"' + url + '\"'
                        LOG.info('cmd = ' + cmd)

                        result = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
                        output, error = result.communicate()

                        if result.returncode != 0:
                            LOG.info('REST SERVER CHECK FAIL [' + str(i) + ']')

                            if i == 3:
                                LOG.info('fail to check rest server.')
                                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT',
                                                       'critical', 'normal',
                                                       'sonawatcher server shutdown',
                                                       str(datetime.now()))
                                conn.close()
                                exitFlag = True
                                self.exit()
                                break
                        else:
                            break
                    except:
                        LOG.exception()

                if exitFlag:
                    break

                pre_stat = watchdog.periodic(conn, pre_stat, db_log)
                time.sleep(CONF.watchdog()['interval'])
            except:
                alarm_event.push_event('sonawatcher', 'SONAWATCHER_DISCONNECT',
                                       'critical', 'normal',
                                       'sonawatcher server shutdown',
                                       str(datetime.now()))
                conn.close()
                LOG.exception()
def get_gw_ratio_gateway(conn, db_log, node_ip, node_name, rx, gw_rx_sum, pre_stat):
    status = 'ok'
    reason = []
    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + ' FROM ' + DB.ONOS_TBL + \
              ' INNER JOIN ' + DB.NODE_INFO_TBL + ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'
        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        # search data_ip
        data_ip = ''
        manage_ip = ''
        cpt_to_gw_packet = 0
        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            data_ip = dict(node_info)['data_ip']
                    except:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if data_ip == '':
            LOG.info('Can not find data ip')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')
        if group_rt is not None:
            for line in group_rt.splitlines():
                if '{tunnelDst=' + data_ip + '}' in line:
                    tmp = line.split(',')
                    for col in tmp:
                        if 'packets=' in col:
                            cpt_to_gw_packet = cpt_to_gw_packet + int(col.split('=')[1])

        if not dict(pre_stat).has_key(node_name + '_GW'):
            status = '-'
            json_ratio = {'current_rx': '-', 'current_compute_tx': '-', 'current_total': '-',
                          'ratio': '-', 'period': CONF.watchdog()['interval'],
                          'status': status, 'packet_loss': False, 'description': ''}
        else:
            cur_rx = rx - int(dict(pre_stat)[node_name + '_GW']['rx'])
            cur_total = gw_rx_sum - int(dict(pre_stat)[node_name + '_GW']['gw_rx_sum'])
            cur_packet = cpt_to_gw_packet - int(dict(pre_stat)[node_name + '_GW']['cpt_to_gw_packet'])

            if cur_rx == 0 and cur_total == 0:
                ratio = 100
            elif cur_rx <= 0 or cur_total < 0:
                ratio = 0
            else:
                ratio = float(cur_rx) * 100 / cur_total

            desc = 'GW RATIO = ' + str(ratio) + ' (' + str(cur_rx) + ' / ' + str(cur_total) + ')'

            loss_flag = False
            if cur_rx < cur_packet:
                LOG.info('GW Ratio Fail. (Data loss)')
                loss_flag = True

            LOG.info('GW Ratio = ' + str(ratio))

            if ratio < float(CONF.alarm()['gw_ratio']) or cur_rx < cur_packet:
                status = 'nok'

            json_ratio = {'current_rx': cur_rx, 'current_compute_tx': cur_packet,
                          'current_total': cur_total, 'ratio': format(ratio, '.2f'),
                          'period': CONF.watchdog()['interval'], 'status': status,
                          'packet_loss': loss_flag, 'description': desc}

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['rx'] = rx
        in_out_dic['gw_rx_sum'] = gw_rx_sum
        in_out_dic['cpt_to_gw_packet'] = cpt_to_gw_packet
        pre_stat[node_name + '_GW'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason.append(json_ratio)

    return status, pre_stat, reason
def get_gw_ratio_compute(conn, db_log, node_ip, node_name, pre_stat):
    status = 'ok'
    reason = []
    try:
        sql = 'SELECT ' + DB.ONOS_TBL + '.nodename, nodelist, ip_addr' + ' FROM ' + DB.ONOS_TBL + \
              ' INNER JOIN ' + DB.NODE_INFO_TBL + ' ON ' + DB.ONOS_TBL + '.nodename = ' + DB.NODE_INFO_TBL + '.nodename'
        nodes_info = conn.cursor().execute(sql).fetchall()

        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            return 'fail', pre_stat, reason

        manage_ip = ''
        hostname = ''
        for nodename, nodelist, ip in nodes_info:
            if not nodelist == 'none':
                for node_info in eval(nodelist):
                    try:
                        if dict(node_info)['management_ip'] == node_ip:
                            manage_ip = ip
                            hostname = dict(node_info)['hostname']
                    except:
                        manage_ip = ''

                    if not manage_ip == '':
                        break

            if not manage_ip == '':
                break

        if hostname == '':
            LOG.info('Can not find hostname')
            return 'fail', pre_stat, reason

        try:
            sql = 'SELECT of_id FROM ' + DB.OPENSTACK_TBL + ' WHERE hostname = \'' + str(hostname) + '\''
            LOG.info(sql)
            node_info = conn.cursor().execute(sql).fetchone()
            of_id = node_info[0]
        except:
            LOG.exception()
            LOG.info('Can not find of_id')
            return 'fail', pre_stat, reason

        group_rt = SshCommand.onos_ssh_exec(manage_ip, 'groups')

        total_cnt = 0
        gw_list = []
        if group_rt is not None:
            for line in group_rt.splitlines():
                if of_id in line:
                    tmp = line.split(',')
                    for col in tmp:
                        if 'packets=' in col:
                            total_cnt = total_cnt + int(col.split('=')[1])
                            gw_list.append(int(col.split('=')[1]))

        str_ratio = ''
        if not dict(pre_stat).has_key(node_name + '_GW'):
            status = '-'
            json_ratio = {'ratio': '-', 'period': CONF.watchdog()['interval'], 'status': status}
        else:
            i = 0
            for gw in gw_list:
                cur_gw = gw - pre_stat[node_name + '_GW']['gw_list'][i]
                cur_total = total_cnt - pre_stat[node_name + '_GW']['gw_total']
                LOG.info('cur_gw = ' + str(cur_gw))
                LOG.info('cur_total = ' + str(cur_total))

                if cur_gw == 0 and cur_total == 0:
                    ratio = 100 / len(gw_list)
                elif cur_gw <= 0 or cur_total <= 0:
                    ratio = 0
                else:
                    ratio = float(cur_gw) * 100 / cur_total

                i = i + 1
                str_ratio = str_ratio + str(ratio) + ':'

                if ratio < float(CONF.alarm()['gw_ratio']):
                    status = 'nok'

            json_ratio = {'ratio': str_ratio.rstrip(':'), 'period': CONF.watchdog()['interval'], 'status': status}
            LOG.info('[COMPUTE] ' + 'GW_RATIO = ' + str_ratio.rstrip(':'))

        try:
            sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
                  ' SET gw_ratio = \"' + str(json_ratio) + '\"' + \
                  ' WHERE nodename = \'' + node_name + '\''
            db_log.write_log('----- UPDATE TRAFFIC GW INFO -----\n' + sql)

            if DB.sql_execute(sql, conn) != 'SUCCESS':
                db_log.write_log('[FAIL] TRAFFIC GW DB Update Fail.')
        except:
            LOG.exception()

        in_out_dic = dict()
        in_out_dic['gw_list'] = gw_list
        in_out_dic['gw_total'] = total_cnt
        pre_stat[node_name + '_GW'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    if not status == 'ok':
        reason.append(json_ratio)

    return status, pre_stat, reason
def vrouter_check(conn, db_log, node_name, user_name, node_ip):
    ret_docker = 'ok'
    docker_list = []
    fail_list = []
    onos_id = ''

    docker_rt = SshCommand.ssh_exec(user_name, node_ip, 'sudo docker ps')
    if docker_rt is not None:
        try:
            for docker in CONF.openstack()['docker_list']:
                for line in docker_rt.splitlines():
                    if line.startswith('CONTAINER'):
                        continue

                    tmp_line = line.split()
                    if ' ' + docker in line:
                        if not 'Up' in line:
                            docker_json = {'name': docker, 'status': 'nok', 'type': 'docker'}
                            fail_list.append(docker_json)
                            ret_docker = 'nok'
                        else:
                            docker_json = {'name': docker, 'status': 'ok', 'type': 'docker'}
                        docker_list.append(docker_json)

                    if 'onos' in tmp_line[1]:
                        onos_id = tmp_line[0]
        except:
            LOG.exception()
    else:
        LOG.error("\'%s\' Vrouter Node Check Error", node_ip)
        ret_docker = 'fail'

    onos_app_list = []
    route_list = []
    if not onos_id == '':
        try:
            # get onos container ip
            onos_rt = SshCommand.ssh_exec(user_name, node_ip,
                                          'sudo docker inspect ' + onos_id + ' | grep IPAddress')
            if onos_rt is not None:
                for line in onos_rt.splitlines():
                    line = line.strip()
                    if line.startswith('\"IPAddress'):
                        tmp = line.split(':')
                        onos_ip = tmp[1].strip().replace('\"', '').replace(',', '')
                        break

            app_list = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'apps -a -s')

            app_active_list = list()
            for line in app_list.splitlines():
                if line.startswith('fail'):
                    continue
                app_active_list.append(line.split(".")[2].split()[0])

            for app in CONF.openstack()['onos_vrouter_app_list']:
                if app in app_active_list:
                    app_json = {'name': app, 'status': 'ok', 'type': 'onos_app'}
                else:
                    app_json = {'name': app, 'status': 'nok', 'type': 'onos_app'}
                    fail_list.append(app_json)
                    ret_docker = 'nok'
                onos_app_list.append(app_json)

            str_route = SshCommand.ssh_pexpect(user_name, node_ip, onos_ip, 'routes')
            for line in str_route.splitlines():
                line = line.strip()
                if (line.startswith('Table') or line.startswith('Network') or line.startswith('Total')):
                    continue

                new_line = " ".join(line.split())
                if new_line.startswith('fail'):
                    continue

                tmp = new_line.split(' ')
                route_json = {'network': tmp[0], 'next_hop': tmp[1]}
                route_list.append(route_json)
        except:
            LOG.exception()
    else:
        LOG.info('can not find onos_id.')
        ret_docker = 'fail'

    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET docker = \"' + str(docker_list) + '\",' + \
              ' onosApp = \"' + str(onos_app_list) + '\",' + \
              ' routingTable = \"' + str(route_list) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE GATEWAY INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] GATEWAY DB Update Fail.')
    except:
        LOG.exception()

    return ret_docker, fail_list
def rest_server_start():
    LOG.info("--- REST Server Start --- ")

    rest_server_daemon = multiprocess.Process(name='rest_server', target=run)
    rest_server_daemon.daemon = True
    rest_server_daemon.start()
            if pid > 0:
                # exit from second parent
                sys.exit(0)
        except OSError, e:
            sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
            sys.exit(1)

        # redirect standard file descriptors
        si = file(self.stdin, 'r')
        so = file(self.stdout, 'a+')
        se = file(self.stderr, 'a+', 0)

        pid = str(os.getpid())
        LOG.info("--- Daemon START ---")
        sys.stderr.write("\nstarted with pid %s\n" % pid)
        sys.stderr.flush()

        if self.pidfile:
            file(self.pidfile, 'w+').write("%s\n" % pid)

        atexit.register(self.delpid)

        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())

    # delete the pid file when the parent process is killed
    def delpid(self):
        try:
            os.remove(self.pidfile)
def get_node_traffic(conn, db_log, node_name, rx_dic, tx_dic, total_rx, total_tx, err_info, pre_stat):
    try:
        status = 'ok'
        reason_list = []

        pre_total_rx = total_rx
        pre_total_tx = total_tx

        # check minimum packet count
        sql = 'SELECT data_ip FROM ' + DB.OPENSTACK_TBL + ' WHERE nodename = \'' + node_name + '\''
        data_ip = conn.cursor().execute(sql).fetchone()[0]

        sql = 'SELECT ip_addr FROM ' + DB.NODE_INFO_TBL + ' WHERE type = \'ONOS\''
        nodes_info = conn.cursor().execute(sql).fetchall()

        min_rx = 0
        if len(nodes_info) == 0:
            LOG.info('Fail to load onos list')
            status = 'fail'
        else:
            for ip in nodes_info:
                flows_rt = SshCommand.onos_ssh_exec(ip[0], '\"flows --filter \'{tunnelDst=' + data_ip + '}\' --short\"')
                if flows_rt is not None:
                    for line in flows_rt.splitlines():
                        if 'tunnelDst' in line:
                            min_rx = min_rx + int(line.split(',')[2].split('=')[1])
                    break

        if not dict(pre_stat).has_key(node_name + '_VXLAN'):
            status = '-'
            ratio = -1
        else:
            total_rx = total_rx - int(dict(pre_stat)[node_name + '_VXLAN']['total_rx'])
            total_tx = total_tx - int(dict(pre_stat)[node_name + '_VXLAN']['total_tx'])
            cur_min = min_rx - int(dict(pre_stat)[node_name + '_VXLAN']['min_rx'])

            if total_rx == 0 and total_tx == 0:
                ratio = 100
            elif total_rx <= 0 or total_tx < 0:
                LOG.info('Node Traffic Ratio Fail.')
                ratio = 0
            else:
                ratio = float(total_rx) * 100 / total_tx

            LOG.info('Node Traffic Ratio = ' + str(ratio))

        port_json = {'rx': rx_dic[node_name], 'minimum_rx': min_rx,
                     'rx_drop': err_info['rx_drop'], 'rx_errs': err_info['rx_err'],
                     'tx': tx_dic[node_name],
                     'tx_drop': err_info['tx_drop'], 'tx_errs': err_info['tx_err']}

        description = ''
        if not status == '-':
            description = 'Ratio of success for all nodes = ' + str(ratio) + ' (' + str(total_rx) + ' / ' + str(total_tx) + ')'

            if ratio < float(CONF.alarm()['node_traffic_ratio']):
                LOG.info('[NODE TRAFFIC] ratio nok')
                status = 'nok'

            if total_rx < cur_min:
                LOG.info('CUR_MIN_RX = ' + str(cur_min) + ', CUR_RX = ' + str(total_rx) + ', Less than rx minimum.')
                status = 'nok'

            if err_info['rx_drop'] - int(dict(pre_stat)[node_name + '_VXLAN']['rx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] rx_drop nok')
                status = 'nok'

            if err_info['rx_err'] - int(dict(pre_stat)[node_name + '_VXLAN']['rx_err']) > 0:
                LOG.info('[NODE TRAFFIC] rx_err nok')
                status = 'nok'

            if err_info['tx_drop'] - int(dict(pre_stat)[node_name + '_VXLAN']['tx_drop']) > 0:
                LOG.info('[NODE TRAFFIC] tx_drop nok')
                status = 'nok'

            if err_info['tx_err'] - int(dict(pre_stat)[node_name + '_VXLAN']['tx_err']) > 0:
                LOG.info('[NODE TRAFFIC] tx_err nok')
                status = 'nok'

        in_out_dic = dict()
        in_out_dic['total_rx'] = pre_total_rx
        in_out_dic['total_tx'] = pre_total_tx
        in_out_dic['min_rx'] = min_rx
        in_out_dic['rx_drop'] = err_info['rx_drop']
        in_out_dic['rx_err'] = err_info['rx_err']
        in_out_dic['tx_drop'] = err_info['tx_drop']
        in_out_dic['tx_err'] = err_info['tx_err']
        pre_stat[node_name + '_VXLAN'] = in_out_dic
    except:
        LOG.exception()
        status = 'fail'

    vxlan_json = {'port_stat_vxlan': port_json,
                  'period': CONF.watchdog()['interval'],
                  'ratio': format(ratio, '.2f'),
                  'current_rx': total_rx,
                  'current_tx': total_tx,
                  'description': description,
                  'threshold': CONF.alarm()['node_traffic_ratio'],
                  'status': status}

    try:
        sql = 'UPDATE ' + DB.OPENSTACK_TBL + \
              ' SET vxlan_traffic = \"' + str(vxlan_json) + '\"' + \
              ' WHERE nodename = \'' + node_name + '\''
        db_log.write_log('----- UPDATE VXLAN STAT INFO -----\n' + sql)

        if DB.sql_execute(sql, conn) != 'SUCCESS':
            db_log.write_log('[FAIL] VXLAN STAT DB Update Fail.')
    except:
        LOG.exception()

    if not status == 'ok':
        reason_list.append(vxlan_json)

    return status, pre_stat, reason_list