def get_cert_info(cert_id, con_ssh=None):
    """Query ``certificate-show`` for *cert_id* and return its key fields.

    Args:
        cert_id: identifier appended to the ``certificate-show`` command.
        con_ssh: ssh client handed to ``cli.system`` (``None`` lets
            ``cli.system`` pick its own default).

    Returns:
        A 4-tuple ``(code, uuid, certtype, states)``:

        * ``0`` - a table was parsed; ``states`` is the evaluated ``details``
          value, or ``''`` when the ``details`` field is empty.
        * ``1`` - the command produced output but no table could be parsed
          from it; the other three items are ``''``.
        * ``2`` - the command produced no output; the other three items
          are ``''``.
    """
    LOG.info('check the status of the current certificate')
    cmd = 'certificate-show ' + cert_id
    output = cli.system(cmd, ssh_client=con_ssh, fail_ok=False)[1]

    if not output:
        LOG.info('no "details" in output')
        return 2, '', '', ''

    table = table_parser.table(output)
    if not table:
        # The original returned actual_id/actual_type/actual_states here even
        # though they are only assigned once a table exists, which would raise
        # instead of returning code 1.
        LOG.info('no table parsed from output of certificate-show')
        return 1, '', '', ''

    actual_id = table_parser.get_value_two_col_table(table, 'uuid')
    actual_type = table_parser.get_value_two_col_table(table, 'certtype')
    actual_details = table_parser.get_value_two_col_table(table, 'details')
    actual_states = ''

    if not actual_details:
        # CGTS-9529: an empty "details" field is a known issue, so it is
        # logged loudly but deliberately not treated as a failure.
        # NOTE(review): the whitespace-flattened source is ambiguous about
        # whether this case returned code 0 or 1; code 0 ("ignore") is kept
        # here - confirm against the callers.
        LOG.fatal('No details in output of certificate-show')
        LOG.fatal(
            'Ignore it until the known issue CGTS-9529 fixed, output:' +
            output)
    else:
        LOG.debug('details from output of certificate-show: {}'.format(
            actual_details))
        # NOTE(review): eval() executes whatever the CLI printed in the
        # "details" column. If that value is always a plain Python literal,
        # ast.literal_eval would be the safer choice - confirm before changing.
        actual_states = eval(actual_details)
        LOG.debug('states: {}'.format(actual_states))

    return 0, actual_id, actual_type, actual_states
def _restore_complete_cmd():
    """Build the shell command that runs ``config_controller --restore-complete``.

    The password returned by ``HostLinuxUser.get_password()`` is piped into
    ``sudo -S``.
    """
    return 'echo "{}" | sudo -S config_controller --restore-complete'.format(
        HostLinuxUser.get_password())


def _restore_and_unlock_host(hostname, boot_interfaces):
    """Restore one storage/compute host and unlock it.

    Opens a VLM console thread for *hostname* (with ``vlm_power_on=True``),
    waits for the host to be reported locked/disabled/online, then unlocks it
    and asserts that the unlock returned 0.
    """
    LOG.tc_step("Restoring {}".format(hostname))
    install_helper.open_vlm_console_thread(
        hostname, boot_interface=boot_interfaces, vlm_power_on=True)

    LOG.info("Verifying {} is Locked, Diabled and Online ...".format(hostname))
    system_helper.wait_for_hosts_states(
        hostname,
        administrative=HostAdminState.LOCKED,
        operational=HostOperState.DISABLED,
        availability=HostAvailState.ONLINE)

    LOG.info("Unlocking {} ...".format(hostname))
    rc, output = host_helper.unlock_host(hostname, available_only=True)
    assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
        hostname, rc, output)


def test_restore(restore_setup):
    """Restore a lab from its system/images backup files and verify health.

    Steps, in order:
        1. Restore the system configuration on controller-0 from the
           ``system.tgz`` backup, then restore the images backup.
        2. Wait for controller-0 to become available (or degraded).
        3. Copy all backup files into ``StxPath.BACKUPS`` and check they
           arrived.
        4. AIO/CPE lab: restore cinder volumes, run restore-complete and, for
           a non-simplex lab, install controller-1.
           Standard lab: install controller-1, restore storage hosts, import
           images, restore cinder volumes, run restore-complete and restore
           the compute hosts.
        5. Delete the backup files, run the post-restore checks, wait for
           alarms to clear and assert the system health query passes.

    Args:
        restore_setup: fixture result; the keys ``"lab"`` and
            ``'tis_backup_files'`` are read from it.
    """
    controller1 = 'controller-1'
    controller0 = 'controller-0'

    lab = restore_setup["lab"]
    is_aio_lab = lab.get('system_type', 'Standard') == 'CPE'
    is_sx = is_aio_lab and (len(lab['controller_nodes']) < 2)

    tis_backup_files = restore_setup['tis_backup_files']
    backup_src = RestoreVars.get_restore_var('backup_src'.upper())
    backup_src_path = RestoreVars.get_restore_var('backup_src_path'.upper())

    controller_node = lab[controller0]
    con_ssh = ControllerClient.get_active_controller(name=lab['short_name'],
                                                     fail_ok=True)
    sys_prompt = Prompt.TIS_NODE_PROMPT_BASE.format(
        '.*' + lab['name'].split('_')[0])
    controller_prompt = '{}|{}'.format(sys_prompt, Prompt.CONTROLLER_0)
    controller_node.telnet_conn.set_prompt(controller_prompt)

    if not con_ssh:
        LOG.info("Establish ssh connection with {}".format(controller0))
        controller_node.ssh_conn = install_helper.ssh_to_controller(
            controller_node.host_ip, initial_prompt=controller_prompt)
        controller_node.ssh_conn.deploy_ssh_key()
        con_ssh = controller_node.ssh_conn
        ControllerClient.set_active_controller(con_ssh)

    LOG.info("Restore system from backup....")
    system_backups = [
        name for name in tis_backup_files if "system.tgz" in name
    ]
    images_backups = [
        name for name in tis_backup_files if "images.tgz" in name
    ]
    # Fail with a readable message instead of a bare IndexError from pop().
    assert system_backups, \
        "No system.tgz backup file found in {}".format(tis_backup_files)
    assert images_backups, \
        "No images.tgz backup file found in {}".format(tis_backup_files)
    system_backup_file = system_backups.pop()
    images_backup_file = images_backups.pop()

    LOG.tc_step("Restoring {}".format(controller0))

    LOG.info("System config restore from backup file {} ...".format(
        system_backup_file))

    if backup_src.lower() == 'usb':
        system_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            system_backup_file)
    else:
        system_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           system_backup_file)

    compute_configured = install_helper.restore_controller_system_config(
        system_backup=system_backup_path, is_aio=is_aio_lab)[2]

    LOG.info('re-connect to the active controller using ssh')
    con_ssh.close()
    controller_node.ssh_conn = install_helper.ssh_to_controller(
        controller_node.host_ip, initial_prompt=controller_prompt)
    LOG.info("Source Keystone user admin environment ...")
    LOG.info("set prompt to:{}, telnet_conn:{}".format(
        controller_prompt, controller_node.telnet_conn))

    controller_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")
    con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
    controller_node.ssh_conn = con_ssh
    ControllerClient.set_active_controller(con_ssh)

    make_sure_all_hosts_locked(con_ssh)

    if backup_src.lower() == 'local':
        images_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           images_backup_file)
        common.scp_from_test_server_to_active_controller(
            "{}/{}".format(backup_src_path, images_backup_file),
            HostLinuxUser.get_home())
    else:
        images_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            images_backup_file)

    LOG.info(
        "Images restore from backup file {} ...".format(images_backup_file))

    new_prompt = r'{}.*~.*\$ |controller\-0.*~.*\$ '.format(
        lab['name'].split('_')[0])
    LOG.info('set prompt to:{}'.format(new_prompt))
    con_ssh.set_prompt(new_prompt)

    install_helper.restore_controller_system_images(
        images_backup=images_backup_path,
        tel_net_session=controller_node.telnet_conn)

    # this is a workaround for CGTS-8190
    install_helper.update_auth_url(con_ssh)

    LOG.tc_step(
        "Verifying restoring controller-0 is complete and is in available state ..."
    )
    LOG.debug('Wait for system ready in 60 seconds')
    time.sleep(60)

    timeout = HostTimeout.REBOOT + 60
    availability = HostAvailState.AVAILABLE
    is_available = system_helper.wait_for_hosts_states(
        controller0,
        availability=availability,
        fail_ok=True,
        timeout=timeout)
    if not is_available:
        LOG.warn(
            'After {} seconds, the first node:{} does NOT reach {}'.format(
                timeout, controller0, availability))
        LOG.info('Check if drbd is still synchronizing data')
        con_ssh.exec_sudo_cmd('drbd-overview')
        is_degraded = system_helper.wait_for_hosts_states(
            controller0,
            availability=HostAvailState.DEGRADED,
            fail_ok=True,
            timeout=300)
        if is_degraded:
            LOG.warn('Node: {} is degraded: {}'.format(
                controller0, HostAvailState.DEGRADED))
            con_ssh.exec_sudo_cmd('drbd-overview')
        else:
            # The original message was never formatted, so the literal "{}"
            # was logged instead of the node name.
            LOG.fatal('Node:{} is NOT in Available nor Degraded status'.format(
                controller0))
            # Deliberately not asserting here: the customer doc does have
            # wording regarding this situation, so the test continues.

    LOG.tc_step("Copying backup files to /opt/backups ... ")
    if backup_src.lower() == 'local':
        # delete the system backup files from sysadmin home
        con_ssh.exec_cmd("rm -f {} {}".format(system_backup_path,
                                              images_backup_path))

        # NOTE(review): the closing double quote of the sed script sits before
        # "/#\1/g", leaving the replacement part outside the quotes - confirm
        # the expression does what is intended. Left unchanged here.
        cmd_rm_known_host = r'sed -i "s/^[^#]\(.*\)"/#\1/g /etc/ssh/ssh_known_hosts; \sync'
        con_ssh.exec_sudo_cmd(cmd_rm_known_host)

        # transfer all backup files to /opt/backups from test server
        with con_ssh.login_as_root():
            con_ssh.scp_on_dest(source_user=TestFileServer.get_user(),
                                source_ip=TestFileServer.get_server(),
                                source_pswd=TestFileServer.get_password(),
                                source_path=backup_src_path + "/*",
                                dest_path=StxPath.BACKUPS + '/',
                                timeout=1200)
    else:
        # copy all backupfiles from USB to /opt/backups
        cmd = " cp {}/* {}".format(BackupRestore.USB_BACKUP_PATH,
                                   StxPath.BACKUPS)
        con_ssh.exec_sudo_cmd(cmd, expect_timeout=600)

    LOG.tc_step("Checking if backup files are copied to /opt/backups ... ")
    assert int(con_ssh.exec_cmd("ls {} | wc -l".format(StxPath.BACKUPS))[1]) >= 2, \
        "Missing backup files in {}".format(StxPath.BACKUPS)

    if is_aio_lab:
        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756)')
        cmd = _restore_complete_cmd()
        controller_node.telnet_conn.login()
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects=[' will reboot on completion'])

        LOG.info('- wait untill reboot completes, ')
        time.sleep(120)
        LOG.info('- confirm the active controller is actually back online')
        controller_node.telnet_conn.login()

        LOG.tc_step(
            "reconnecting to the active controller after restore-complete")
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)

        if not compute_configured:
            # NOTE(review): the flattened source does not show where this
            # block ends; everything up to wait_for_hosts_ready() is assumed
            # to belong to it - confirm against the original file.
            LOG.tc_step(
                'Latest 18.07 EAR1 or Old-load on AIO/CPE lab: config its '
                'compute functionalities')

            LOG.tc_step('Run restore-complete (CGTS-9756)')
            controller_node.telnet_conn.login()
            cmd = _restore_complete_cmd()
            controller_node.telnet_conn.exec_cmd(cmd,
                                                 extra_expects=' will reboot ')
            controller_node.telnet_conn.close()

            LOG.info(
                'Wait until "config_controller" reboot the active controller')
            time.sleep(180)

            controller_node.telnet_conn = install_helper.open_telnet_session(
                controller_node)
            controller_node.telnet_conn.login()
            time.sleep(120)

            con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
            controller_node.ssh_conn = con_ssh
            ControllerClient.set_active_controller(con_ssh)

            host_helper.wait_for_hosts_ready(controller0)

        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        if not is_sx:
            install_non_active_node(controller1, lab)

    elif len(lab['controller_nodes']) >= 2:
        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        install_non_active_node(controller1, lab)

        boot_interfaces = lab['boot_device_dict']

        hostnames = system_helper.get_hosts()
        storage_hosts = [host for host in hostnames if 'storage' in host]
        compute_hosts = [
            host for host in hostnames
            if 'storage' not in host and 'controller' not in host
        ]

        if storage_hosts:
            # NOTE(review): the Ceph health check and the image import are
            # assumed to run only when storage hosts exist - confirm against
            # the original file.
            for storage_host in storage_hosts:
                _restore_and_unlock_host(storage_host, boot_interfaces)

            LOG.info("Veryifying the Ceph cluster is healthy ...")
            storage_helper.wait_for_ceph_health_ok(timeout=600)

            LOG.info("Importing images ...")
            image_backup_files = install_helper.get_backup_files(
                IMAGE_BACKUP_FILE_PATTERN, StxPath.BACKUPS, con_ssh)
            LOG.info("Image backup found: {}".format(image_backup_files))
            imported = install_helper.import_image_from_backup(
                image_backup_files)
            LOG.info("Images successfully imported: {}".format(imported))

        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756), regular lab')
        controller_node.telnet_conn.login()
        cmd = _restore_complete_cmd()
        # NOTE(review): the source text between 'controller-0 login:' and
        # 'rebuild ssh connection' was masked ('******'); it is reconstructed
        # here as the end of this call followed by a LOG.info - confirm
        # against the unmasked file.
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects='controller-0 login:')

        LOG.info('rebuild ssh connection')
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
        controller_node.ssh_conn = con_ssh

        LOG.tc_step("Restoring Compute Nodes ...")
        for compute_host in compute_hosts:
            _restore_and_unlock_host(compute_host, boot_interfaces)

        LOG.info("All nodes {} are restored ...".format(hostnames))
    else:
        LOG.warn('Only 1 controller, but not AIO lab!!??')

    LOG.tc_step("Delete backup files from {} ....".format(StxPath.BACKUPS))
    con_ssh.exec_sudo_cmd("rm -rf {}/*".format(StxPath.BACKUPS))

    LOG.tc_step('Perform post-restore testing/checking')
    post_restore_test(con_ssh)

    LOG.tc_step("Waiting until all alarms are cleared ....")
    timeout = 300
    healthy, alarms = system_helper.wait_for_all_alarms_gone(timeout=timeout,
                                                             fail_ok=True)
    if not healthy:
        LOG.warn('Alarms exist: {}, after waiting {} seconds'.format(
            alarms, timeout))
        rc, message = con_ssh.exec_sudo_cmd('drbd-overview')

        # The original used r'] sync\'ed: ', a raw string that keeps the
        # backslash and therefore could never match the command output.
        if rc != 0 or ('[===>' not in message
                       and "] sync'ed: " not in message):
            LOG.warn('Failed to get drbd-overview information')

        LOG.info('Wait for the system to be ready in {} seconds'.format(
            HostTimeout.REBOOT))
        system_helper.wait_for_all_alarms_gone(timeout=HostTimeout.REBOOT,
                                               fail_ok=False)

    LOG.tc_step("Verifying system health after restore ...")
    rc, failed = system_helper.get_system_health_query(con_ssh=con_ssh)
    assert rc == 0, "System health not OK: {}".format(failed)

    collect_logs()
def perform_vm_operation(vm_type, vm_id, op='live_migration',
                         extra_specs='vtpm'):
    """Run the operation named by *op* against a VM.

    Args:
        vm_type: type label of the VM; passed to the host reboot/lock helpers
            and to ``get_flavor_id`` for the resize operations.
        vm_id: id of the VM to operate on.
        op: operation name. Either a key of the dispatch table below
            (migrations, stop/start, suspend/resume, pause/unpause, reboots,
            lock/unlock, evacuate) or one of ``'resize_to_autorc'``,
            ``'resize_to_non_autorc'``, ``'resize_to_non_vtpm'``.
        extra_specs: only used in the log message.

    Returns:
        ``True`` when the operation was dispatched, ``False`` when *op* is
        not supported.
    """
    LOG.info('Perform action:{} to the VM, extra specs:{}'.format(
        op, extra_specs))

    # Each entry takes (vm_type, vm_id); most only need the vm id.
    op_table = {
        'live_migration':
            lambda x, y: vm_helper.live_migrate_vm(y),
        'cold_migration':
            lambda x, y: vm_helper.cold_migrate_vm(y),
        'stop_start':
            lambda x, y: (vm_helper.stop_vms(y), vm_helper.start_vms(y)),
        'suspend_resume':
            lambda x, y: (vm_helper.suspend_vm(y), vm_helper.resume_vm(y)),
        'pause_unpause':
            lambda x, y: (vm_helper.pause_vm(y), vm_helper.unpause_vm(y)),
        'reboot_host':
            lambda x, y: reboot_hosting_node(x, y, force_reboot=False),
        'soft_reboot':
            lambda x, y: vm_helper.reboot_vm(y, hard=False),
        'hard_reboot':
            lambda x, y: vm_helper.reboot_vm(y, hard=True),
        'lock_unlock':
            lambda x, y: lock_unlock_hosting_node(x, y, force_lock=False),
        'evacuate':
            lambda x, y: reboot_hosting_node(x, y, force_reboot=True),
    }

    if op in op_table:
        LOG.info('Perform action: {}'.format(op))
        op_table[op](vm_type, vm_id)
        return True

    # NOTE(review): in the flattened source the `if vm_type == ...` checks
    # below are read as guarding only the log line, with the resize itself
    # unconditional - confirm against the original file.
    if op == 'resize_to_autorc':
        if vm_type == 'autorc':
            LOG.info(
                'resize from AUTO-RECOVERY to another AUTO-RECOVER flavor')
        to_flavor_id = get_flavor_id(vm_type, 'autorc2')
        LOG.info('TODO: {}, m_type={}, to_flavor_id={}'.format(
            to_flavor_id, vm_type, to_flavor_id))
    elif op == 'resize_to_non_autorc':
        LOG.info('perform {} on type:{}, id:{}'.format(op, vm_type, vm_id))
        if vm_type == 'non_autorc2':
            LOG.warn(
                'resize from AUTO-RECOVERY to another AUTO-RECOVER flavor')
        to_flavor_id = get_flavor_id(vm_type, 'non_autorc2')
    elif op == 'resize_to_non_vtpm':
        LOG.info('perform {} on type:{}, id:{}'.format(op, vm_type, vm_id))
        to_flavor_id = get_flavor_id(vm_type, 'non_vtpm')
    else:
        LOG.fatal('Unsupported action: {}'.format(op))
        return False

    vm_helper.resize_vm(vm_id, to_flavor_id)
    # The resize branches used to fall off the end and return None, which a
    # caller checking the result would read as failure.
    return True