def upgrade_controller(from_release, to_release):
    """Executed on the release N+1 side to upgrade controller-1.

    Performs the full data-migration sequence: stops sysinv-agent, NFS-mounts
    the mate controller's filesystems, migrates configuration (keyring,
    pxeboot, armada, helm, sysinv), imports and migrates the databases under
    a locally started postgres, applies the upgrade manifest, copies the
    migrated database back to controller-0, and finally sets the upgrade
    flags on the mate controller before restarting sysinv-agent.

    :param from_release: release being upgraded from (e.g. "20.06")
    :param to_release: release being upgraded to; must differ from
        from_release
    :raises Exception: if from_release == to_release
    :raises subprocess.CalledProcessError: re-raised when a required
        external command (systemctl, pg_ctl, cp) fails
    """
    if from_release == to_release:
        raise Exception("Cannot upgrade from release %s to the same "
                        "release %s." % (from_release, to_release))

    # Use a context manager so the devnull handle is always released,
    # including on the many early-raise error paths below.
    with open(os.devnull, 'w') as devnull:
        LOG.info("Upgrading controller from %s to %s" % (from_release,
                                                         to_release))

        # Stop sysinv-agent so it doesn't interfere
        LOG.info("Stopping sysinv-agent")
        try:
            subprocess.check_call(["systemctl", "stop", "sysinv-agent"],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.error("Failed to stop %s service" % "sysinv-agent")
            raise

        # Mount required filesystems from mate controller
        LOG.info("Mounting filesystems")
        nfs_mount_filesystem(PLATFORM_PATH)
        nfs_mount_filesystem(utils.RABBIT_PATH)
        os.mkdir(POSTGRES_MOUNT_PATH)
        nfs_mount_filesystem(utils.POSTGRES_PATH, POSTGRES_MOUNT_PATH)

        # Migrate keyring data
        print("Migrating keyring data...")
        migrate_keyring_data(from_release, to_release)

        # Migrate pxeboot config
        print("Migrating pxeboot configuration...")
        migrate_pxeboot_config(from_release, to_release)

        # Migrate armada config
        print("Migrating armada configuration...")
        migrate_armada_config(from_release, to_release)

        # Migrate helm config
        print("Migrating helm configuration...")
        migrate_helm_config(from_release, to_release)

        # Migrate sysinv data.
        print("Migrating sysinv configuration...")
        migrate_sysinv_data(from_release, to_release)

        # Prepare for database migration
        print("Preparing for database migration...")
        prepare_postgres_filesystems()

        # Create the postgres database
        create_database()

        # Start the postgres server
        try:
            subprocess.check_call(['sudo', '-u', 'postgres', 'pg_ctl', '-D',
                                   utils.POSTGRES_DATA_DIR, 'start'],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to start postgres service")
            raise

        # Wait for postgres to start
        # TODO: Make this deterministic (use wait_service?)
        time.sleep(5)

        # Import databases
        print("Importing databases...")
        import_databases(from_release, to_release)

        role = get_system_role()
        shared_services = get_shared_services()

        # Create /tmp/python_keyring - used by keystone manifest.
        shutil.copytree(os.path.join(PLATFORM_PATH, ".keyring", to_release,
                                     "python_keyring"),
                        "/tmp/python_keyring")

        # Copy admin.conf file from /opt/platform to
        # /etc/kubernetes/admin.conf during upgrade
        try:
            subprocess.check_call(
                ["cp",
                 os.path.join(PLATFORM_PATH, "config", to_release,
                              "kubernetes",
                              utils.KUBERNETES_ADMIN_CONF_FILE),
                 os.path.join(utils.KUBERNETES_CONF_PATH,
                              utils.KUBERNETES_ADMIN_CONF_FILE)],
                stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to copy %s" %
                          os.path.join(utils.KUBERNETES_CONF_PATH,
                                       utils.KUBERNETES_ADMIN_CONF_FILE))
            raise

        # Migrate hiera data
        migrate_hiera_data(from_release, to_release, role=role)
        utils.add_upgrade_entries_to_hiera_data(from_release)

        # Get database credentials
        db_credentials = get_db_credentials(
            shared_services, from_release, role=role)

        # Create any new databases
        print("Creating new databases...")
        create_databases(from_release, to_release, db_credentials)

        print("Migrating databases...")
        # Migrate sysinv database
        migrate_sysinv_database()

        # Migrate databases
        migrate_databases(from_release, shared_services, db_credentials,
                          role=role)

        print("Applying configuration...")

        # Execute migration scripts
        utils.execute_migration_scripts(
            from_release, to_release, utils.ACTION_MIGRATE)

        uuid = get_controller_1_uuid()

        update_platform_conf_file(uuid)

        # Stop postgres server
        try:
            subprocess.check_call(['sudo', '-u', 'postgres', 'pg_ctl', '-D',
                                   utils.POSTGRES_DATA_DIR, 'stop'],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to stop postgres service")
            raise

        # Apply "upgrades" manifest
        LOG.info("Applying upgrades manifest")
        myip = gethostaddress(utils.CONTROLLER_1_HOSTNAME)
        utils.apply_upgrade_manifest(myip)

        # Remove manifest and keyring files
        shutil.rmtree("/tmp/puppet")
        shutil.rmtree("/tmp/python_keyring")

        # Generate config to be used by "regular" manifest
        LOG.info("Generating config for %s" % utils.CONTROLLER_1_HOSTNAME)
        try:
            cutils.create_system_config()
            cutils.create_host_config(utils.CONTROLLER_1_HOSTNAME)
        except Exception as e:
            LOG.exception(e)
            LOG.info("Failed to update hiera configuration")
            raise

        # Remove /etc/kubernetes/admin.conf after it is used to generate
        # the hiera data
        admin_conf = os.path.join(utils.KUBERNETES_CONF_PATH,
                                  utils.KUBERNETES_ADMIN_CONF_FILE)
        try:
            # Argv list instead of a shell-interpolated string: same
            # effect, no shell word-splitting on the path. Failure is
            # deliberately non-fatal (no re-raise), as in the original.
            subprocess.check_call(["rm", "-f", admin_conf], stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to remove file %s" % admin_conf)

        print("Shutting down upgrade processes...")

        # Stop postgres service
        LOG.info("Stopping postgresql service")
        try:
            subprocess.check_call(["systemctl", "stop", "postgresql"],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to stop postgresql service")
            raise

        # Stop rabbitmq-server service
        LOG.info("Stopping rabbitmq-server service")
        try:
            subprocess.check_call(["systemctl", "stop", "rabbitmq-server"],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception("Failed to stop rabbitmq-server service")
            raise

        # Copy upgraded database back to controller-0
        print("Writing upgraded databases...")
        LOG.info("Copying upgraded database to controller-0")
        try:
            subprocess.check_call(
                ["cp", "-a", os.path.join(utils.POSTGRES_PATH, to_release),
                 os.path.join(POSTGRES_MOUNT_PATH, to_release)],
                stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.exception(
                "Failed to copy migrated postgres database to controller-0")
            raise

        # Remove temporary filesystems
        remove_temp_filesystem("cgts-vg", "dbdump-temp-lv",
                               POSTGRES_DUMP_MOUNT_PATH)
        remove_temp_filesystem("cgts-vg", "postgres-temp-lv",
                               utils.POSTGRES_PATH)

        # Remove mounts
        LOG.info("Removing mounts")
        unmount_filesystem(PLATFORM_PATH)
        unmount_filesystem(utils.RABBIT_PATH)
        unmount_filesystem(POSTGRES_MOUNT_PATH)
        os.rmdir(POSTGRES_MOUNT_PATH)

        # Set upgrade flags on mate controller
        LOG.info("Setting upgrade flags on mate controller")
        os.mkdir("/tmp/etc_platform")
        nfs_mount_filesystem("/etc/platform", "/tmp/etc_platform")
        upgrade_complete_flag_file = os.path.join(
            "/tmp/etc_platform",
            os.path.basename(CONTROLLER_UPGRADE_COMPLETE_FLAG))
        open(upgrade_complete_flag_file, "w").close()
        upgrade_flag_file = os.path.join(
            "/tmp/etc_platform",
            os.path.basename(CONTROLLER_UPGRADE_FLAG))
        os.remove(upgrade_flag_file)
        upgrade_complete_flag_file = os.path.join(
            "/tmp/etc_platform",
            os.path.basename(CONTROLLER_UPGRADE_STARTED_FLAG))
        os.remove(upgrade_complete_flag_file)
        unmount_filesystem("/tmp/etc_platform")
        os.rmdir("/tmp/etc_platform")

        # Restart the sysinv agent to report the inventory status.
        # The sysinv.conf contains temporary parameters that are used for
        # data-migration. By removing that sysinv.conf we trigger the
        # sysinv-agent to load the correct conf from the drbd filesystem.
        os.remove("/etc/sysinv/sysinv.conf")
        LOG.info("Starting sysinv-agent")
        cutils.start_service("sysinv-agent")

        print("Controller-1 upgrade complete")
        LOG.info("Controller-1 upgrade complete!!!")
def upgrade_controller_simplex(backup_file):
    """Perform the simplex upgrade on controller-0.

    Broadly this is a system restore combined with the upgrade data
    migration. We extract the data from the archive, restore the database
    to a temporary filesystem, migrate the data and generate the N+1
    manifests. The migrated database is dumped to /opt/backups. We apply
    the N+1 manifests as INITIAL_CONFIG_PRIMARY and then restore the
    migrated database. Finally we apply any necessary upgrade manifests
    and restore the rest of the system data.

    :param backup_file: path to the upgrade backup archive (absolute or
        relative; relative paths are made absolute)
    :raises Exception: when configuration was already done, the backup
        file is missing/invalid, an upgrade is already in progress, the
        releases match, patches fail to install, the node needs a reboot,
        or a post-restore service fails to initialize
    """
    if (os.path.exists(constants.CGCS_CONFIG_FILE) or
            os.path.exists(CONFIG_PATH) or
            os.path.exists(constants.INITIAL_CONFIG_COMPLETE_FILE)):
        print_log_info("Configuration has already been done. "
                       "An upgrade operation can only be done "
                       "immediately after the load has been installed.")
        raise Exception("System configuration already completed")

    if not os.path.isfile(backup_file):
        raise Exception("Backup file (%s) not found." % backup_file)

    if not os.path.isabs(backup_file):
        backup_file = os.path.abspath(backup_file)

    if os.path.isfile(RESTORE_IN_PROGRESS_FLAG):
        raise Exception("Upgrade already in progress.")
    else:
        # Close immediately: only the flag file's existence matters.
        open(RESTORE_IN_PROGRESS_FLAG, 'w').close()

    # Context manager guarantees the devnull handle is released on the
    # many raise/early-exit paths below.
    with open(os.devnull, 'w') as devnull:
        print_log_info("Starting controller upgrade")

        staging_dir = tempfile.mkdtemp(dir='/tmp')
        # Permission change required or postgres restore fails
        subprocess.call(['chmod', 'a+rx', staging_dir], stdout=devnull)
        os.chdir('/')

        try:
            archive = tarfile.open(backup_file)
        except tarfile.TarError as e:
            LOG.exception(e)
            raise Exception("Error opening backup file. Invalid backup file.")

        metadata = get_simplex_metadata(archive, staging_dir)

        from_release = metadata['upgrade']['from_release']
        to_release = metadata['upgrade']['to_release']

        check_load_version(to_release)
        # TODO: Switch this over to use Ansible
        # backup_restore.check_load_subfunctions(archive, staging_dir)

        # Patching is potentially a multi-phase step.
        # If the controller is impacted by patches from the backup,
        # it must be rebooted before continuing the restore.
        # If this is the second pass through, we can skip over this.
        if not os.path.isfile(restore_patching_complete):
            print("Restoring Patches")
            extract_relative_directory(archive, "patching", patching_permdir)
            extract_relative_directory(archive, "updates",
                                       patching_repo_permdir)

            print("Applying Patches")
            try:
                subprocess.check_output(["sw-patch", "install-local"])
            except subprocess.CalledProcessError:
                LOG.error("Failed to install patches")
                raise Exception("Failed to install patches")

            # Close immediately: only the flag file's existence matters.
            open(restore_patching_complete, 'w').close()

            # If the controller was impacted by patches, we need to reboot.
            if os.path.isfile(node_is_patched):
                LOG.info("This controller has been patched. Rebooting now")
                print("\nThis controller has been patched. Rebooting now\n\n")
                time.sleep(5)
                os.remove(RESTORE_IN_PROGRESS_FLAG)
                if staging_dir:
                    shutil.rmtree(staging_dir, ignore_errors=True)
                subprocess.call("reboot")

            else:
                # We need to restart the patch controller and agent, since
                # we setup the repo and patch store outside its control
                subprocess.call(
                    ["systemctl",
                     "restart",
                     "sw-patch-controller-daemon.service"],
                    stdout=devnull, stderr=devnull)
                subprocess.call(
                    ["systemctl",
                     "restart",
                     "sw-patch-agent.service"],
                    stdout=devnull, stderr=devnull)

        if os.path.isfile(node_is_patched):
            # If we get here, it means the node was patched by the user
            # AFTER the restore applied patches and rebooted, but didn't
            # reboot.
            # This means the patch lineup no longer matches what's in the
            # backup, but we can't (and probably shouldn't) prevent that.
            # However, since this will ultimately cause the node to fail
            # the goenabled step, we can fail immediately and force the
            # user to reboot.
            print_log_info("\nThis controller has been patched, but not "
                           "rebooted.")
            print_log_info("Please reboot before continuing the restore "
                           "process.")
            raise Exception("Controller node patched without rebooting")

        # Flag can now be cleared
        os.remove(restore_patching_complete)

        if from_release == to_release:
            raise Exception("Cannot upgrade from release %s to the same "
                            "release %s." % (from_release, to_release))

        # TODO Use db_fs_size from yaml data and add to runtime parameters
        # during the bootstrap manifest
        # db_size = metadata['filesystem']['database_gib']
        # db_bytes = db_size * 1024 * 1024 * 1024
        # db_filesystem_size = str(db_bytes) + "B"

        # Stop sysinv-agent so it doesn't interfere
        LOG.info("Stopping sysinv-agent")
        try:
            subprocess.check_call(["systemctl", "stop", "sysinv-agent"],
                                  stdout=devnull)
        except subprocess.CalledProcessError:
            LOG.error("Failed to stop %s service" % "sysinv-agent")
            raise

        print_log_info("Extracting data from archive")
        extract_data_from_archive(archive, staging_dir, from_release,
                                  to_release)

        migrate_platform_conf(staging_dir)

        # Migrate keyring data
        print_log_info("Migrating keyring data...")
        migrate_keyring_data(from_release, to_release)

        # Migrate pxeboot config
        print_log_info("Migrating pxeboot configuration...")
        migrate_pxeboot_config(from_release, to_release)

        # Migrate sysinv data.
        print_log_info("Migrating sysinv configuration...")
        migrate_sysinv_data(from_release, to_release)

        # Simplex configurations can not have shared services
        shared_services = []

        # Migrate hiera data
        migrate_hiera_data(from_release, to_release)
        db_credentials = get_db_credentials(shared_services, from_release)

        os.unlink(PLATFORM_PATH)

        # Write the simplex flag
        cutils.write_simplex_flag()
        cutils.configure_hostname('controller-0')

        controller_0_address = cutils.get_address_from_hosts_file(
            'controller-0')

        hieradata_tmpdir = os.path.join(
            staging_dir, constants.HIERADATA_PERMDIR.strip('/'))
        print_log_info("Applying Bootstrap manifest...")
        cutils.apply_manifest(controller_0_address,
                              sysinv_constants.CONTROLLER,
                              'bootstrap',
                              hieradata_tmpdir)

        persist_platform_data(staging_dir)

        cutils.stop_service("sysinv-agent")
        cutils.stop_service("sysinv-api")
        cutils.stop_service("sysinv-conductor")
        cutils.stop_service("openstack-keystone")

        extract_postgres_data(archive)

        # Import databases
        print_log_info("Importing databases...")
        import_databases(from_release, to_release, utils.POSTGRES_PATH,
                         simplex=True)

        # Create any new databases
        print_log_info("Creating new databases...")
        create_databases(from_release, to_release, db_credentials)

        print_log_info("Migrating databases...")
        # Migrate sysinv database
        migrate_sysinv_database()

        # Migrate databases
        migrate_databases(from_release, shared_services, db_credentials,
                          simplex=True)

        print_log_info("Applying configuration...")

        # Execute migration scripts
        utils.execute_migration_scripts(
            from_release, to_release, utils.ACTION_MIGRATE)

        # Generate "regular" manifests
        LOG.info("Generating manifests for %s" %
                 sysinv_constants.CONTROLLER_0_HOSTNAME)

        # TODO: Switch this over to use Ansible
        # backup_restore.configure_loopback_interface(archive)

        print_log_info("Creating configs...")
        cutils.create_system_config()
        cutils.create_host_config()

        print_log_info("Persisting Data")

        cutils.start_service("openstack-keystone")
        cutils.start_service("sysinv-conductor")
        cutils.start_service("sysinv-api")
        cutils.start_service("sysinv-agent")

        runtime_filename = os.path.join(staging_dir, 'runtime.yaml')
        utils.create_simplex_runtime_config(runtime_filename)
        if not os.path.isfile(runtime_filename):
            # There is no runtime yaml file to apply
            runtime_filename = None

        print_log_info("Applying manifest...")
        cutils.apply_manifest(controller_0_address,
                              sysinv_constants.CONTROLLER,
                              'controller',
                              constants.HIERADATA_PERMDIR,
                              runtime_filename=runtime_filename)

        cutils.persist_config()

        cutils.apply_banner_customization()

        # TODO: Switch this over to use Ansible
        # backup_restore.restore_ldap(archive, backup_restore.ldap_permdir,
        #                             staging_dir)
        # backup_restore.restore_std_dir(archive, backup_restore.home_permdir)

        archive.close()
        shutil.rmtree(staging_dir, ignore_errors=True)

        cutils.mtce_restart()
        cutils.mark_config_complete()

        print_log_info("Waiting for services to start")

        for service in ['sysinv-conductor', 'sysinv-inv']:
            if not cutils.wait_sm_service(service):
                raise Exception("Services have failed to initialize.")

        os.remove(RESTORE_IN_PROGRESS_FLAG)

        # Create the flag file that permits the
        # restore_compute command option.
        cutils.touch(restore_compute_ready)

        print_log_info("Data restore complete")