def orphan(parser):
    logging.disable(logging.INFO)
    if not NODE_ROLE.is_controller():
        LOG.warn('This command can only run on controller node !')
        return
    # run delete orphan:
    # run the delete-servers thread first
    nova_thread = RunNovaThread()
    nova_thread.start()
    nova_thread.join()
    # run the other threads in parallel
    threads = [RunCinderThread(), RunGlanceThread(),
               RunNetBaseThread(), RunFirewallThread(),
               RunSecgroupThread(), RunVPNThread(),
               RunLBThread(), RunQoSThread()]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    logging.disable(logging.NOTSET)

def base_delete(self, resource_name, resource_ids, delete_func):
    no_log_resources = []
    while resource_ids:
        for resource_id in resource_ids:
            # avoid logging the same delete info many times
            if resource_id not in no_log_resources:
                with log_disabled():
                    LOG.info('Delete %s [%s]' % (resource_name, resource_id))
                no_log_resources.append(resource_id)
            try:
                delete_func(resource_id)
                # deleted successfully, break
                resource_ids.remove(resource_id)
                break
            except Conflict:
                # retry: deal with the conflict
                continue
            except NotFound:
                # destroy_volume() deletes volumes and snapshots;
                # if a snapshot is NotFound, do nothing
                resource_ids.remove(resource_id)
                break
            except Exception as e:
                LOG.warn('Can not delete %s [%s]'
                         % (resource_name, resource_id))
                LOG.error(e)
                # something else went wrong: break, do not retry
                resource_ids.remove(resource_id)
                break

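# base_delete() and _run() below use a log_disabled() context manager that is
# not defined in this section. A minimal sketch of one possible shape,
# assuming it temporarily silences the standard logging module (e.g. chatty
# client libraries) via the logging.disable() pattern used in orphan() and
# detach_volume(), while the project's own LOG wrapper keeps printing:
import logging
from contextlib import contextmanager

@contextmanager
def log_disabled(level=logging.INFO):
    logging.disable(level)  # drop records at `level` and below
    try:
        yield
    finally:
        logging.disable(logging.NOTSET)  # restore normal logging
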
def _run(self):
    self.base_delete('floating ip', self.floatingips,
                     neutronclient.delete_floatingip)
    for port_id in self.ports:
        try:
            with log_disabled():
                LOG.info('Delete port [%s]' % port_id)
            neutronclient.delete_port(port_id)
        except Conflict:
            with log_disabled():
                LOG.info(' Solving conflict: remove interface...')
            router_id = neutronclient.show_port(port_id)['port']['device_id']
            neutronclient.remove_interface_router(router_id,
                                                  {'port_id': port_id})
        except Exception as e:
            LOG.warn('Can not delete port [%s]' % port_id)
            LOG.error(e)
    # if a firewall was created with a target router, the router
    # CAN NOT be deleted before the firewall is deleted.
    # NOTE: retry is already handled by base_delete()
    self.base_delete('router', self.routers, neutronclient.delete_router)
    self.base_delete('subnet', self.subnets, neutronclient.delete_subnet)
    self.base_delete('network', self.networks, neutronclient.delete_network)

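# _run() above belongs to one of the Run*Thread workers started by orphan().
# A hypothetical sketch (not the project's actual class) of how such a worker
# could be shaped: a threading.Thread subclass whose run() delegates to the
# resource-specific _run() shown above.
from threading import Thread

class RunNetBaseThreadSketch(Thread):
    """Hypothetical worker shape; _run() is the method shown above."""
    def run(self):
        self._run()
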
def ami(parser):
    if not NODE_ROLE.is_controller():
        LOG.warn('This command can only run on controller node !')
    else:
        # "if controller leave to last"
        if not parser.KERNEL_FILE and not parser.INITRD_FILE \
                and not parser.IMAGE_FILE:
            LOG.error('Lack of arguments, you can use --help '
                      'to get help information\n')
        elif not parser.KERNEL_FILE:
            LOG.error('Please specify the kernel file\n')
        elif not parser.INITRD_FILE:
            LOG.error('Please specify the initrd file\n')
        elif not parser.IMAGE_FILE:
            LOG.error('Please specify the image file\n')
        else:
            # split the path and filename
            kernel_file_name = os.path.basename(parser.KERNEL_FILE)
            initrd_file_name = os.path.basename(parser.INITRD_FILE)
            if parser.NAME:
                ami_image_name = parser.NAME
            else:
                # if no image name is specified, use IMAGE_FILE as the AMI name
                ami_image_name = os.path.basename(parser.IMAGE_FILE)
            ami_image_upload(parser.KERNEL_FILE, kernel_file_name,
                             parser.INITRD_FILE, initrd_file_name,
                             parser.IMAGE_FILE, ami_image_name)

def check_ceph():
    # node role check
    if not NODE_ROLE.is_fuel():
        if not NODE_ROLE.is_controller():
            if not NODE_ROLE.is_ceph_osd():
                LOG.warn('This command can only run on fuel or '
                         'controller or ceph-osd node !')
                return
    if NODE_ROLE.is_fuel():
        check_all_nodes('ceph')
        return
    # get cluster status
    LOG.info('%s%s Checking ceph cluster status' % ('=' * 5, '>'))
    ceph_check_health()
    # check osd status
    LOG.info('%s%s Checking ceph osd status' % ('=' * 5, '>'))
    check_success = True
    osd_status = get_ceph_osd_status()
    if not osd_status:
        LOG.error('Can not get ceph osd status !')
        check_success = False
    else:
        # skip header lines; only parse rows that name an osd
        for line in osd_status.split('\n'):
            if 'id' not in line and 'weigh' not in line and 'osd.' in line:
                osd = line.split()[2]
                status = line.split()[3]
                if status != 'up':
                    LOG.error('%s status is not correct, please check it !'
                              % osd)
                    check_success = False
    if check_success:
        LOG.info('Ceph osd status check successfully !')

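# get_ceph_osd_status() is not shown in this section. Judging by the parsing
# above (split()[2] is the osd name, split()[3] the up/down state), it returns
# output in the style of the older `ceph osd tree` listing -- roughly like the
# sample below, which is an assumption for illustration only:
SAMPLE_OSD_STATUS = """\
# id    weight  type name       up/down reweight
-1      2.0     root default
-2      2.0             host node-1
0       1.0                     osd.0   up      1
1       1.0                     osd.1   down    1
"""
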
def detach_volume(attached_servers, volume_id):
    LOG.info('Detaching volume "%s" .' % volume_id)
    volume_bootable = get_volume_info(volume_id).bootable
    if volume_bootable == 'false':
        # detach volume from instance by python sdk first
        logging.disable(logging.INFO)
        for server_id in attached_servers:
            pc.nova_delete_server_volume(server_id, volume_id)
        logging.disable(logging.NOTSET)
        t = 0
        while t <= 14:
            volume_status = get_volume_info(volume_id).status
            if volume_status == 'available':
                break
            time.sleep(3)
            t += 3
        # if timed out, detach-disk by virsh on the compute node
        # & update the database
        if get_volume_info(volume_id).status != 'available':
            if detach_disk_on_compute_node(attached_servers, volume_id):
                # update database
                LOG.info(' Updating database.')
                # NOTE: use UTC time
                detach_at = time.strftime('%Y-%m-%d %X', time.gmtime())
                sql_update_cinder_db = ('UPDATE volumes SET '
                                        'status="available",'
                                        'attach_status="detached" '
                                        'WHERE id="%s";' % volume_id)
                cinder_db.connect(sql_update_cinder_db)
                for server_id in attached_servers:
                    sql_update_nova_db = ('UPDATE block_device_mapping '
                                          'SET deleted_at="%s",deleted=id '
                                          'WHERE instance_uuid="%s" '
                                          'and volume_id="%s" and deleted=0;'
                                          % (detach_at, server_id, volume_id))
                    nova_db.connect(sql_update_nova_db)
        if get_volume_info(volume_id).status == 'available':
            return True
    else:
        LOG.warn('Can not detach root device. '
                 'Please delete instance "%s" first.' % attached_servers)
        return False

def check_all_nodes(check_obj):
    if check_obj == 'all':
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls --all'
        else:
            check_cmd = 'sudo eayunstack doctor cls --all'
    else:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls -n %s' % check_obj
        else:
            check_cmd = 'sudo eayunstack doctor cls -n %s' % check_obj
    # get controller node list
    node_list = get_node_list('controller')
    # ssh to all controller nodes to run the check
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    else:
        if check_obj == 'ceph':
            # only need to check one node for the ceph cluster
            ceph_node = node_list[0]
            run_doctor_cmd_on_node('controller', ceph_node, check_cmd)
        else:
            nodes = []
            for node in node_list:
                node_info = {}
                node_info['role'] = 'controller'
                node_info['name'] = node
                nodes.append(node_info)
            result = run_doctor_on_nodes(nodes, check_cmd)
            for res in result:
                LOG.info(res, remote=True)

def check_mysql():
    # node role check
    if not NODE_ROLE.is_fuel():
        if not NODE_ROLE.is_controller():
            LOG.warn('This command can only run on fuel or controller node !')
            return
    if NODE_ROLE.is_fuel():
        check_all_nodes('mysql')
        return
    LOG.info('%s%s Checking mysql cluster status' % ('=' * 5, '>'))
    # get the running node list of the mysql cluster
    running_nodes = get_mysql_nodes()
    if running_nodes is None:
        LOG.error('Can not get the running node list for mysql cluster !')
        return
    # get all controller node hostnames
    controllers = get_controllers_hostname()
    if controllers is None:
        LOG.error('Can not get the controllers node list !')
        return
    # check that every controller node is in the mysql cluster
    error_nodes = []
    for node in controllers:
        if node not in running_nodes:
            error_nodes.append(node)
    if error_nodes:
        LOG.error('Node %s is not running in mysql cluster !' % error_nodes)
        LOG.error('Mysql cluster check failed !')
    else:
        LOG.info('Mysql cluster check successfully !')

def check_all_nodes(check_obj):
    if check_obj == 'all':
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls --all'
        else:
            check_cmd = 'sudo eayunstack doctor cls --all'
    else:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls -n %s' % check_obj
        else:
            check_cmd = 'sudo eayunstack doctor cls -n %s' % check_obj
    # get controller node list
    node_list = get_node_list('controller')
    # ssh to all controller nodes to run the check
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    else:
        if check_obj == 'ceph':
            # only need to check one node for the ceph cluster
            ceph_node = node_list[0]
            run_doctor_cmd_on_node('controller', ceph_node, check_cmd)
        else:
            (proc_list, pipe) = run_doctor_on_nodes('controller', node_list,
                                                    check_cmd)
            for proc in proc_list:
                proc.join()
                LOG.info(pipe.recv(), remote=True)

def check_rabbitmq():
    # node role check
    if not NODE_ROLE.is_fuel():
        if not NODE_ROLE.is_controller():
            LOG.warn('This command can only run on fuel or controller node !')
            return
    if NODE_ROLE.is_fuel():
        check_all_nodes('rabbitmq')
        return
    LOG.info('%s%s Checking rabbitmq cluster status' % ('=' * 5, '>'))
    # get all controller node hostnames
    controllers = get_controllers_hostname()
    if controllers is None:
        LOG.error('Can not get the controllers node list !')
        return
    # get the masters & slaves node list
    running_nodes = get_rabbitmq_nodes()
    if running_nodes is None:
        LOG.error('Can not get the running node list for rabbitmq cluster !')
        return
    # check that all controller nodes are in the masters + slaves node list
    error_nodes = []
    for node in controllers:
        if node.split('.')[0] not in running_nodes:
            error_nodes.append(node)
    if error_nodes:
        LOG.error('Node %s not in rabbitmq cluster !' % error_nodes)
        LOG.error('Rabbitmq cluster check failed !')
    else:
        LOG.info('Rabbitmq cluster check successfully !')

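# get_rabbitmq_nodes() is not shown here. A minimal sketch of one way it could
# work (an assumption, not the project's actual implementation): parse the
# running_nodes list out of `rabbitmqctl cluster_status` and strip the
# "rabbit@" prefix, so it can be compared against short hostnames above.
import re
import commands

def get_rabbitmq_nodes_sketch():
    (status, out) = commands.getstatusoutput('rabbitmqctl cluster_status')
    if status != 0:
        return None
    # e.g. {running_nodes,['rabbit@node-1','rabbit@node-2']}
    match = re.search(r'{running_nodes,\[(.*?)\]}', out, re.S)
    if not match:
        return None
    return [n.strip("' \n").replace('rabbit@', '')
            for n in match.group(1).split(',')]
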
def check_selinux():
    # correct state is one of [enforcing, permissive, disabled]
    correct_state = correct_conf = "disabled"
    # check current state
    (s, out) = commands.getstatusoutput('getenforce')
    current_state = out
    if s != 0:
        LOG.error('getenforce error, please check it')
    else:
        if current_state == correct_state.capitalize():
            LOG.debug('SELinux current state is: %s' % current_state)
        else:
            LOG.warn('SELinux current state is: %s' % current_state)
            LOG.error('SELinux state needs to be %s'
                      % correct_state.capitalize())
    # check the profile /etc/sysconfig/selinux
    current_conf = commands.getoutput(
        'grep "^SELINUX=" /etc/sysconfig/selinux | cut -d "=" -f 2')
    if current_conf == correct_conf:
        LOG.debug('SELinux current conf in profile is: %s' % current_conf)
    else:
        LOG.warn('SELinux current conf in profile is: %s' % current_conf)
        LOG.error('SELinux configuration in profile needs to be %s'
                  % correct_conf)

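# For reference, the grep above expects /etc/sysconfig/selinux to contain a
# line like the following (standard sample content, abridged):
SAMPLE_SELINUX_CONF = """\
# This file controls the state of SELinux on the system.
# SELINUX= can take one of these three values:
#     enforcing - SELinux security policy is enforced.
#     permissive - SELinux prints warnings instead of enforcing.
#     disabled - No SELinux policy is loaded.
SELINUX=disabled
"""
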
def check_all_nodes(check_obj):
    if check_obj == 'all':
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls --all'
        else:
            check_cmd = 'sudo eayunstack doctor cls --all'
    else:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls -n %s' % check_obj
        else:
            check_cmd = 'sudo eayunstack doctor cls -n %s' % check_obj
    # get controller node list
    node_list = get_node_list('controller')
    # ssh to all controller nodes to run the check
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    else:
        if check_obj == 'ceph':
            # only need to check one node for the ceph cluster
            ceph_node = node_list[0]
            LOG.info('%s Role: %-10s Node: %-13s %s'
                     % ('*' * 15, 'controller', ceph_node, '*' * 15))
            ssh_connect2(ceph_node, check_cmd)
        else:
            for node in node_list:
                LOG.info('%s Role: %-10s Node: %-13s %s'
                         % ('*' * 15, 'controller', node, '*' * 15))
                ssh_connect2(node, check_cmd)

def check_all_nodes(check_obj):
    if check_obj == 'all':
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls --all'
        else:
            check_cmd = 'sudo eayunstack doctor cls --all'
    else:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls -n %s' % check_obj
        else:
            check_cmd = 'sudo eayunstack doctor cls -n %s' % check_obj
    # get controller node list
    node_list = get_node_list('controller')
    # ssh to all controller nodes to run the check
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    else:
        if check_obj == 'ceph':
            # only need to check one node for the ceph cluster
            ceph_node = node_list[0]
            res = run_doctor_cmd_on_node('controller', ceph_node, check_cmd)
            LOG.info(res, remote=True)
        else:
            nodes = []
            for node in node_list:
                node_info = {}
                node_info['role'] = 'controller'
                node_info['name'] = node
                nodes.append(node_info)
            result = run_doctor_on_nodes(nodes, check_cmd)
            for res in result:
                LOG.info(res, remote=True)

def check_neutron_agents(agents_list):
    dhcp_agent_alive_node_number = 0
    dhcp_agent_not_alive_node = []
    for agent in agents_list:
        _msg_admin_state = ('Neutron agent %s on %s admin_state_up is %s'
                            % (agent['binary'], agent['host'],
                               str(agent['admin_state_up'])))
        _msg_not_alive = ('Neutron agent %s on %s is not alive'
                          % (agent['binary'], agent['host']))
        if not agent['admin_state_up']:
            LOG.warn(_msg_admin_state)
        else:
            LOG.debug(_msg_admin_state)
        if not agent['alive']:
            if agent['binary'] == 'neutron-dhcp-agent':
                LOG.debug(_msg_not_alive)
                dhcp_agent_not_alive_node.append(agent)
            else:
                LOG.error(_msg_not_alive)
        else:
            LOG.debug('Neutron agent %s on %s is alive'
                      % (agent['binary'], agent['host']))
            if agent['binary'] == 'neutron-dhcp-agent':
                dhcp_agent_alive_node_number += 1
    # NOTE: it is OK as long as at least one dhcp-agent is alive
    if dhcp_agent_alive_node_number < 1:
        for agent in dhcp_agent_not_alive_node:
            LOG.error('Neutron agent %s on %s is not alive'
                      % (agent['binary'], agent['host']))

def check_all_nodes(check_obj):
    if check_obj == 'all':
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls --all'
        else:
            check_cmd = 'sudo eayunstack doctor cls --all'
    else:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor cls -n %s' % check_obj
        else:
            check_cmd = 'sudo eayunstack doctor cls -n %s' % check_obj
    # get controller node list
    node_list = get_node_list('controller')
    # ssh to all controller nodes to run the check
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    else:
        if check_obj == 'ceph':
            # only need to check one node for the ceph cluster
            ceph_node = node_list[0]
            run_doctor_cmd_on_node('controller', ceph_node, check_cmd)
        else:
            proc_list = run_doctor_on_nodes('controller', node_list,
                                            check_cmd)
            for proc in proc_list:
                proc.join()

def init(parser):
    if NODE_ROLE.is_unknown():
        LOG.error('Can not confirm the node role!')
    if not NODE_ROLE.is_fuel():
        LOG.warn('This command can only run on fuel node !')
        return
    init_node_list_file()
    init_node_role_file()

def deployment_monitor_plugins(parser):
    if not NODE_ROLE.is_fuel():
        LOG.warn('This command can only run on fuel node !')
        return
    if parser.INFLUXDB:
        deployment_influxdb_grafana(parser.ENV)
    if parser.LMA_COLLECTOR:
        deployment_lma_collector(parser.ENV)

def check_services(services_list):
    for service in services_list:
        if service['status'] != 'enabled':
            LOG.warn('Service %s on %s status is %s'
                     % (service['binary'], service['host'],
                        service['status']))
        if service['state'] != 'up':
            LOG.error('Service %s on %s state is %s'
                      % (service['binary'], service['host'],
                         service['state']))

def check_disk():
    limit = 85
    vfs = os.statvfs("/")
    # get the "/" filesystem used-space percentage
    used_percent = int(math.ceil(
        (float(vfs.f_blocks - vfs.f_bavail) / float(vfs.f_blocks)) * 100))
    if used_percent >= 0 and used_percent < limit:
        LOG.debug('The "/" filesystem used %s%% space !' % used_percent)
    elif used_percent >= limit:
        LOG.warn('The "/" filesystem used %s%% space !' % used_percent)

def volume(parser):
    if not NODE_ROLE.is_controller():
        LOG.warn('This command can only run on controller node !')
        return
    if parser.DESTROY_VOLUME:
        if not parser.ID:
            LOG.error('Please use [--id ID] to specify the volume ID !')
        else:
            volume_id = parser.ID
            destroy_volume(volume_id)

def check_profile(profile, role):
    # if the profile file does not exist, go back
    if not os.path.exists(profile):
        LOG.error('Can not find this profile. Abort this check!')
        return
    # get the template path
    template = get_template_path(profile, role)
    # if the template file does not exist, go back
    if not os.path.exists(template):
        LOG.error('Template file is missing, please check it by yourself.')
        return
    if role != 'mongo':
        # check file resolvability; if not resolvable, go back
        for filepath in (profile, template):
            if not check_file_resolvability(filepath):
                return
        # check profile keys
        check_list = get_check_list(profile)
        miss_keys = []
        for section in sorted(check_list.keys()):
            for key in check_list[section]:
                (miss_key, current_value) = check_key(section, key,
                                                      profile, template)
                if miss_key:
                    if '[' + section + ']' not in miss_keys:
                        miss_keys.append('[' + section + ']')
                    miss_keys.append(key + ' = ' + current_value)
        if miss_keys:
            LOG.warn('Can not check the following options, '
                     'please check them by yourself.')
            for entry in miss_keys:
                fmt_print(entry)
        # some keys are in the template but not in the profile (lost keys)
        t_check_list = get_check_list(template)
        for t_section in sorted(t_check_list.keys()):
            for t_key in t_check_list[t_section]:
                check_lost_key(t_section, t_key, profile)
    else:
        # check profile keys
        check_list = get_check_list_common(profile)
        for key in check_list:
            check_key_common(key, profile, template)
        # some keys are in the template but not in the profile (lost keys)
        t_check_list = get_check_list_common(template)
        for t_key in t_check_list:
            check_lost_key_common(t_key, profile)

def check_lost_key(section, key, profile):
    p = ConfigParser.ConfigParser()
    p.read(profile)
    try:
        dict(p.items(section))[key]
    except ConfigParser.NoSectionError:
        LOG.warn('Lost section [%s] in this profile.' % section)
    except KeyError:
        LOG.warn('Lost following option in this profile. Please check it.')
        fmt_print('[%s]' % section)
        fmt_print(key)

def check(role, obj):
    if NODE_ROLE.is_fuel():
        check_nodes(role, obj)
    else:
        if not eval('NODE_ROLE.is_%s' % role)():
            LOG.warn('This command can only run on fuel or %s node !' % role)
        else:
            if obj == 'all':
                eval('check_%s_%s' % (role, 'profile'))()
                eval('check_%s_%s' % (role, 'service'))()
            else:
                eval('check_%s_%s' % (role, obj))()

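# The eval() calls above resolve names like NODE_ROLE.is_controller and
# check_controller_profile at run time. For illustration only, an eval-free
# sketch of the same dispatch (non-fuel branch; the project itself uses eval):
def check_dispatch_sketch(role, obj):
    if not getattr(NODE_ROLE, 'is_%s' % role)():
        LOG.warn('This command can only run on fuel or %s node !' % role)
        return
    objs = ('profile', 'service') if obj == 'all' else (obj,)
    for o in objs:
        # look up check_<role>_<obj> in the module namespace
        globals()['check_%s_%s' % (role, o)]()
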
def init(parser):
    if NODE_ROLE.is_unknown():
        LOG.error('Can not confirm the node role!')
    if not NODE_ROLE.is_fuel():
        LOG.warn('This command can only run on fuel node !')
        return
    if parser.UPDATE:
        update()
        return
    init_env()
    init_node_list_file()
    init_node_role_file()

def determine_delete_instance(instance_id, instance_state):
    if instance_state != 'ERROR':
        LOG.warn('Instance is not in "ERROR" status. Can not delete it!')
        return False
    while True:
        determine = raw_input('Instance "%s" is in "ERROR" status. '
                              'Do you really want to delete it? [yes/no]: '
                              % instance_id)
        if determine in ['yes', 'no']:
            break
    return determine == 'yes'

def instance(parser):
    if not NODE_ROLE.is_controller():
        LOG.warn('This command can only run on controller node !')
        return
    if parser.DELETE_INTANCE:
        if not parser.ID:
            LOG.error('Please use [--id ID] to specify the instance ID !')
        else:
            instance_id = parser.ID
            if parser.DELETE_DISK:
                delete_instance(instance_id, delete_disk=True)
            else:
                delete_instance(instance_id)

def check_key_common(key, profile, template):
    current_value = get_value_common(key, profile)
    correct_value = get_value_common(key, template)
    filterfile = template + '.filter'
    if os.path.exists(filterfile):
        if get_value_common(key, filterfile) == '':
            LOG.debug('"%s = %s" option is in the filter file, '
                      'skip checking this option.' % (key, current_value))
        elif not correct_value:
            LOG.warn('Can not check the following option, '
                     'please check it by yourself.')
            fmt_print('%s=%s' % (key, current_value))
        elif current_value != correct_value:
            LOG.error('"%s" option check failed' % key)
            fmt_print('Current is "%s=%s"' % (key, current_value))
            fmt_print('Correct is "%s=%s"' % (key, correct_value))

def check_rabbitmq_queues(except_queues=None):
    messages_warn_limit = 100
    memory_warn_limit = 1048576  # unit: Byte
    queues_list = get_rabbitmq_queues_list()
    for queue in queues_list:
        if int(queue['messages']) > messages_warn_limit \
                or int(queue['memory']) > memory_warn_limit:
            if except_queues and queue['name'] in except_queues:
                continue
            (mem_size, mem_unit) = bytes2human(int(queue['memory']))
            LOG.warn('Queue %s has %s messages and is using %s %s of memory.'
                     % (queue['name'], queue['messages'],
                        mem_size, mem_unit))

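# bytes2human() is not shown in this section. A minimal sketch of an
# implementation matching how it is used above -- returning a (size, unit)
# tuple -- under the assumption that it scales by powers of 1024:
def bytes2human_sketch(n):
    # e.g. 1048576 -> (1.0, 'MB')
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if n < 1024 or unit == 'TB':
            return (round(n, 1), unit)
        n /= 1024.0
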
def check_all():
    '''Check All Cluster'''
    # node role check
    if not NODE_ROLE.is_fuel():
        if not NODE_ROLE.is_controller():
            LOG.warn('This command can only run on fuel or controller node !')
            return
    if NODE_ROLE.is_fuel():
        check_all_nodes('all')
    else:
        check_rabbitmq()
        check_mysql()
        check_haproxy()
        check_ceph()

def detach_volume(attached_servers, volume_id):
    LOG.info('Detaching volume "%s" .' % volume_id)
    volume_bootable = get_volume_info(volume_id).bootable
    if volume_bootable == 'false':
        # detach volume from instance by python sdk first
        logging.disable(logging.INFO)
        for server_id in attached_servers:
            pc.nova_delete_server_volume(server_id, volume_id)
        logging.disable(logging.NOTSET)
        t = 0
        while t <= 14:
            volume_status = get_volume_info(volume_id).status
            if volume_status == 'available':
                break
            time.sleep(3)
            t += 3
        # if timed out, detach-disk by virsh on the compute node
        # & update the database
        if get_volume_info(volume_id).status != 'available':
            if detach_disk_on_compute_node(attached_servers, volume_id):
                # update database
                LOG.info(' Updating database.')
                db_set_volume_detached(volume_id)
                # NOTE: use UTC time
                detach_at = time.strftime('%Y-%m-%d %X', time.gmtime())
                for server_id in attached_servers:
                    sql_update_nova_db = ('UPDATE block_device_mapping '
                                          'SET deleted_at="%s",deleted=id '
                                          'WHERE instance_uuid="%s" '
                                          'and volume_id="%s" and deleted=0;'
                                          % (detach_at, server_id, volume_id))
                    nova_db.connect(sql_update_nova_db)
        if get_volume_info(volume_id).status == 'available':
            return True
    else:
        # check whether the instances were already deleted
        for attached_server in attached_servers:
            sql_get_instance_deleted_status = \
                'SELECT deleted from instances where uuid=\'%s\';' \
                % attached_server
            instance_deleted_status = \
                nova_db.connect(sql_get_instance_deleted_status)[0][0]
            if instance_deleted_status == 1:
                continue
            else:
                LOG.warn('Please delete instance "%s" first.'
                         % attached_servers)
                return False
        # if the instances were deleted, set volume attach_status to detached
        if determine_set_volume_to_detached(attached_servers):
            LOG.info('Set volume %s attach status to detached' % volume_id)
            db_set_volume_detached(volume_id)
            return True
        else:
            LOG.warn('Please delete instance "%s" first.' % attached_servers)
            return False

def check_rabbitmqrestart():
    if NODE_ROLE.is_controller():
        log_path = '/.eayunstack/rabbitmq_start_time'
        start_time = _get_from_ps()
        if os.path.exists(log_path):
            log_start_time = _get_from_log(log_path)
            if log_start_time == start_time:
                LOG.debug('service rabbitmq has never been restarted')
            else:
                LOG.warn('service rabbitmq has been restarted at %s'
                         % start_time)
                _log_time(log_path, start_time)
        else:
            LOG.debug('the log file is not found')
            _log_time(log_path, start_time)

def check_all():
    '''Check All Cluster'''
    # node role check
    if not NODE_ROLE.is_fuel():
        if not NODE_ROLE.is_controller():
            LOG.warn('This command can only run on fuel or controller node !')
            return
    if NODE_ROLE.is_fuel():
        check_all_nodes('all')
    else:
        check_rabbitmq()
        check_mysql()
        check_haproxy()
        check_ceph()
        check_pacemaker()
        check_cephspace()

def check_cephspace():
    # node role check
    if NODE_ROLE.is_controller():
        LOG.info('%s%s Checking ceph space' % ('=' * 5, '>'))
        ceph_space = get_ceph_space()
        limit_warn = 83
        limit_error = 93
        if ceph_space >= 0 and ceph_space < limit_warn:
            LOG.info('The ceph space is used: %s%%' % ceph_space)
        elif ceph_space >= limit_warn and ceph_space < limit_error:
            LOG.warn('The ceph space is used: %s%%' % ceph_space)
        # when the space check errors, get_ceph_space() returns -1
        elif ceph_space < 0:
            LOG.error('The ceph space check error: Get ceph space failed')
        else:
            LOG.error('The ceph space is used: %s%%' % ceph_space)

def check_nodes(node_role, check_obj, multi_role=False):
    if multi_role:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor stack --' + check_obj
        else:
            check_cmd = 'sudo eayunstack doctor stack --' + check_obj
    else:
        if LOG.enable_debug:
            check_cmd = ('sudo eayunstack --debug doctor stack --'
                         + check_obj + ' --%s' % node_role)
        else:
            check_cmd = ('sudo eayunstack doctor stack --'
                         + check_obj + ' --%s' % node_role)
    node_list = get_node_list(node_role)
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    proc_list = run_doctor_on_nodes(node_role, node_list, check_cmd)
    for proc in proc_list:
        proc.join()

def check_nodes(node_role, check_obj, multi_role=False):
    if multi_role:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor stack --' + check_obj
        else:
            check_cmd = 'sudo eayunstack doctor stack --' + check_obj
    else:
        if LOG.enable_debug:
            check_cmd = ('sudo eayunstack --debug doctor stack --'
                         + check_obj + ' --%s' % node_role)
        else:
            check_cmd = ('sudo eayunstack doctor stack --'
                         + check_obj + ' --%s' % node_role)
    node_list = get_node_list(node_role)
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    for node in node_list:
        LOG.info('%s Role: %-10s Node: %-13s %s'
                 % ('*' * 15, node_role, node, '*' * 15))
        # ssh to the node and run the command
        ssh_connect2(node, check_cmd)

def check_nodes(node_role, check_obj, multi_role=False):
    if multi_role:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor stack --' + check_obj
        else:
            check_cmd = 'sudo eayunstack doctor stack --' + check_obj
    else:
        if LOG.enable_debug:
            check_cmd = ('sudo eayunstack --debug doctor stack --'
                         + check_obj + ' --%s' % node_role)
        else:
            check_cmd = ('sudo eayunstack doctor stack --'
                         + check_obj + ' --%s' % node_role)
    node_list = get_node_list(node_role)
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    (proc_list, pipe) = run_doctor_on_nodes(node_role, node_list, check_cmd)
    for proc in proc_list:
        proc.join()
        LOG.info(pipe.recv(), remote=True)

def detach_volume(attached_servers, volume_id, interactive):
    LOG.info('Detaching volume "%s" .' % volume_id)
    # check whether the instances were already deleted
    exist_servers = []
    for attached_server in attached_servers:
        sql_get_instance_deleted_status = \
            'SELECT deleted from instances where uuid=\'%s\';' \
            % attached_server
        instance_deleted_status = \
            nova_db.connect(sql_get_instance_deleted_status)[0][0]
        if instance_deleted_status != 0:
            continue
        else:
            exist_servers.append(attached_server)
    if len(exist_servers) == 0:
        # if the instances were deleted, set volume attach_status to detached
        if determine_set_volume_to_detached(attached_servers, interactive):
            LOG.info('Set volume %s attach status to detached' % volume_id)
            db_set_volume_detached(volume_id)
            return True
        else:
            LOG.warn('Please set volume attach status to "detached" first.')
            return False
    if detach_disk_on_compute_node(exist_servers, volume_id):
        # update database
        LOG.info(' Updating database.')
        db_set_volume_detached(volume_id)
        for server_id in exist_servers:
            detach_at = time.strftime('%Y-%m-%d %X', time.gmtime())
            sql_update_nova_db = 'UPDATE block_device_mapping SET '\
                                 'deleted_at="%s",deleted=id WHERE '\
                                 'instance_uuid="%s" and volume_id="%s" '\
                                 'and deleted=0;'\
                                 % (detach_at, server_id, volume_id)
            nova_db.connect(sql_update_nova_db)
        return True
    else:
        LOG.warn('Please delete instance "%s" first.' % exist_servers)
        return False

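# db_set_volume_detached() is not defined in this section. A minimal sketch,
# assuming it wraps the cinder UPDATE statement used by the earlier
# detach_volume() revision above:
def db_set_volume_detached_sketch(volume_id):
    sql = ('UPDATE volumes SET status="available",attach_status="detached" '
           'WHERE id="%s";' % volume_id)
    cinder_db.connect(sql)
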
def check_haproxyresource():
    if not NODE_ROLE.is_controller():
        LOG.warn('This command can only run on controller node !')
        return
    LOG.info('%s%s Checking HAProxy resource status' % ('=' * 5, '>'))
    monitor_url = get_haproxy_monitor_url()
    if not monitor_url:
        LOG.error('Can not get public vip in /etc/astute.yaml!')
        return
    monitor_content = get_haproxy_monitor_content(monitor_url)
    if not monitor_content:
        return
    resource_list = csv2dict(monitor_content)

    def _print_status(log_level='debug'):
        if check_status:
            eval('LOG.%s' % log_level)(
                '%s on %s status is %s, check_status is %s.'
                % (pxname, svname, status, check_status))
        else:
            eval('LOG.%s' % log_level)('%s on %s status is %s.'
                                       % (pxname, svname, status))

    for resource in resource_list:
        pxname = resource['pxname']
        svname = resource['svname']
        status = resource['status']
        check_status = resource['check_status']
        if svname == 'FRONTEND':
            if status == 'OPEN':
                _print_status()
            else:
                _print_status('error')
        else:
            if status == 'UP':
                _print_status()
            else:
                _print_status('error')

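# csv2dict() is not shown here. The HAProxy stats endpoint returns CSV whose
# header line starts with "# pxname,svname,...". A minimal sketch of a parser
# matching how the result is consumed above (an assumption, for illustration):
import csv

def csv2dict_sketch(monitor_content):
    lines = monitor_content.strip().splitlines()
    # strip the leading "# " from the header line
    lines[0] = lines[0].lstrip('# ')
    return list(csv.DictReader(lines))
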
def check_cpuload():
    cpu_processors = get_cpu_processors()
    if not cpu_processors:
        LOG.error('Can not get cpu cores!')
        return
    # get the cpu load limits
    cpu_load_warn_limit = cpu_processors * 0.7
    cpu_load_error_limit = cpu_processors * 0.9
    # get the cpu load averages (one, five, and fifteen minute averages)
    cpu_load = get_cpu_load()
    if not cpu_load:
        LOG.error('Can not get cpu load!')
        return
    # use the five minute average to judge the cpu load status
    cpu_load_five_minute_average = cpu_load.split(',')[1].strip()
    if Decimal(cpu_load_five_minute_average) > Decimal(cpu_load_error_limit):
        LOG.error('Current CPU load averages are: %s. '
                  'Please check system status.' % cpu_load)
    elif Decimal(cpu_load_five_minute_average) > Decimal(cpu_load_warn_limit):
        LOG.warn('Current CPU load averages are: %s. '
                 'Please check system status.' % cpu_load)
    else:
        LOG.debug('Current CPU load averages are: %s.' % cpu_load)

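# get_cpu_processors() and get_cpu_load() are not shown in this section.
# Minimal sketches matching how check_cpuload() consumes them (assumptions,
# not the project's actual helpers): a core count, and the three load
# averages as one comma-separated string.
import os
import multiprocessing

def get_cpu_processors_sketch():
    return multiprocessing.cpu_count()

def get_cpu_load_sketch():
    # e.g. "0.24, 0.31, 0.28" -- split(',')[1] is the 5-minute average
    return ', '.join('%.2f' % load for load in os.getloadavg())
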
def _generate_plugin_repo_conf(plugin_name, file_path, fuel_node_ip):
    repo_name = plugin_name
    plugin_version = get_plugin_version(plugin_name)
    if not plugin_version:
        LOG.warn('Can not get the version of plugin "%s", '
                 'skip generating "%s.repo"' % (plugin_name, repo_name))
        return
    baseurl = 'http://' + fuel_node_ip \
              + ':8080/plugins/' + plugin_name \
              + '-' + plugin_version \
              + '/repositories/centos'
    gpgcheck = 0
    repo_conf = '[' + repo_name + ']' + '\n' \
                + 'name=' + repo_name + '\n' \
                + 'baseurl=' + baseurl + '\n' \
                + 'gpgcheck=' + str(gpgcheck)
    LOG.debug('Generate %s' % file_path)
    with open(file_path, 'w') as repo_conf_file:
        repo_conf_file.write(repo_conf)

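# For a hypothetical plugin "lma_collector" version "0.9.0" on a fuel node at
# 10.20.0.2 (example values only), the function above would write a repo file
# like:
SAMPLE_REPO_CONF = """\
[lma_collector]
name=lma_collector
baseurl=http://10.20.0.2:8080/plugins/lma_collector-0.9.0/repositories/centos
gpgcheck=0
"""
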
def check_nodes(node_role, check_obj, multi_role=False):
    if multi_role:
        if LOG.enable_debug:
            check_cmd = 'sudo eayunstack --debug doctor stack --' + check_obj
        else:
            check_cmd = 'sudo eayunstack doctor stack --' + check_obj
    else:
        if LOG.enable_debug:
            check_cmd = ('sudo eayunstack --debug doctor stack --'
                         + check_obj + ' --%s' % node_role)
        else:
            check_cmd = ('sudo eayunstack doctor stack --'
                         + check_obj + ' --%s' % node_role)
    node_list = get_node_list(node_role)
    if len(node_list) == 0:
        LOG.warn('Node list is null !')
        return
    nodes = []
    for node in node_list:
        node_info = {}
        node_info['role'] = node_role
        node_info['name'] = node
        nodes.append(node_info)
    result = run_doctor_on_nodes(nodes, check_cmd)
    for res in result:
        LOG.info(res, remote=True)