def check_cluster_health() -> None:
    """
    Check cluster status and make sure the cluster is healthy.

    Exceptions:
        UpgradeError
    """
    # Check if the cluster is running on the current node
    _, _, rc = SimpleCommand().run_cmd(PCS_CLUSTER_STATUS, check_error=False)
    if rc != 0:
        raise UpgradeError("Cluster is not running on current node")
    output, _, _ = SimpleCommand().run_cmd(PCS_FAILCOUNT_STATUS)
    if "INFINITY" in output:
        raise UpgradeError(
            f"Cluster is not stable, some resources are not healthy. {output}")

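# PCS_CLUSTER_STATUS and PCS_FAILCOUNT_STATUS are constants imported from
# elsewhere in the project. A minimal sketch of plausible definitions follows;
# the exact command strings used by the project may differ.
PCS_CLUSTER_STATUS = "pcs status"                     # assumed: overall cluster status
PCS_FAILCOUNT_STATUS = "pcs resource failcount show"  # assumed: per-resource failcounts
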
def backup_consul(filename: str = "consul-kv-dump.json",
                  dst: str = BACKUP_DEST_DIR_CONSUL) -> None:
    """
    Backup Consul KV store.

    Parameters:
        filename: Consul dump file name
        dst: Directory to store the Consul dump backup

    Return:
        None

    Exceptions:
        UpgradeError
    """
    consul_kv_dump = os.path.join(dst, filename)
    if os.path.exists(consul_kv_dump):
        # Remove older archives, then archive the previous dump
        for archive in glob(f"{dst}/*.tar.gz"):
            os.remove(archive)
        timestamp = strftime("%Y%m%d%H%M%S", gmtime())
        archive_name = f"{consul_kv_dump}.{timestamp}.tar.gz"
        Log.info(f"Backup existing {consul_kv_dump} to {archive_name}")
        with tarfile.open(archive_name, "w:gz") as tar:
            tar.add(consul_kv_dump)
    else:
        os.makedirs(dst, exist_ok=True)
    consul_export_cmd = f"consul kv export > {shlex.quote(consul_kv_dump)}"
    cp = subprocess.run(consul_export_cmd, shell=True, stderr=subprocess.PIPE)
    if cp.returncode:
        raise UpgradeError(
            f"Consul export failed with error {cp.stderr.decode()}")

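# Hypothetical counterpart to backup_consul(): restore the KV store from the dump
# created above. This is a sketch, not part of the original module; the function
# name and the reuse of UpgradeError are assumptions.
def restore_consul(filename: str = "consul-kv-dump.json",
                   src: str = BACKUP_DEST_DIR_CONSUL) -> None:
    consul_kv_dump = os.path.join(src, filename)
    if not os.path.exists(consul_kv_dump):
        raise UpgradeError(f"Consul dump {consul_kv_dump} not found")
    # `consul kv import @<file>` reads the JSON dump produced by `consul kv export`
    consul_import_cmd = f"consul kv import @{shlex.quote(consul_kv_dump)}"
    cp = subprocess.run(consul_import_cmd, shell=True, stderr=subprocess.PIPE)
    if cp.returncode:
        raise UpgradeError(
            f"Consul import failed with error {cp.stderr.decode()}")
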
def _yaml_to_dict(yaml_file=None):
    '''
    Convert YAML key-value configuration into a Python dictionary.
    '''
    if yaml_file is None:
        raise UpgradeError('yaml file path can not be None. Please provide '
                           'the HA yaml conf file path for conversion')
    with open(yaml_file, 'r') as conf_file:
        file_as_dict = yaml.safe_load(conf_file)
    return file_as_dict

def _check_for_any_resource_presence() -> None:
    '''
    Check if any resources are already present in the cluster.
    If yes, the pre-upgrade steps failed, hence exit.
    '''
    Log.info('Check for any resource presence in the cluster')
    root = _get_cib_xml()
    resource_list = [e.attrib["id"] for e in root.findall(".//lrm_resource")
                     if "id" in e.attrib]
    if resource_list:
        raise UpgradeError('Some resources are already present in the '
                           'cluster. Perform the upgrade process again')

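# _get_cib_xml() is referenced above but defined elsewhere. A minimal sketch of
# such a helper, assuming it dumps the pacemaker CIB with `pcs cluster cib` and
# parses it with ElementTree; the real implementation may differ.
import xml.etree.ElementTree as ET

def _get_cib_xml_sketch() -> ET.Element:
    output, _, _ = SimpleCommand().run_cmd("pcs cluster cib")
    return ET.fromstring(output)
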
def _switch_cluster_mode(cluster_mode, retry_count=0) -> None:
    '''
    Perform a cluster operation to change the mode, such as standby or
    unstandby, and retry the operation on failure.
    '''
    try:
        cluster_switch_mode_command = (
            cluster_mode + f' --wait={CLUSTER_STANDBY_UNSTANDBY_TIMEOUT}')
        SimpleCommand().run_cmd(cluster_switch_mode_command)
    except Exception as err:
        if retry_count != 3:
            # Retry the mode switch before giving up
            _switch_cluster_mode(cluster_mode, retry_count + 1)
            return
        raise UpgradeError('Failed to switch the mode of the cluster. '
                           'Retry upgrade again') from err

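# Usage sketch for _switch_cluster_mode(): the caller passes the base pcs command
# and the function appends the --wait timeout. The standby/unstandby command
# strings below are assumptions, not constants from this module.
def _standby_unstandby_example() -> None:
    _switch_cluster_mode("pcs node standby --all")    # before upgrade work
    _switch_cluster_mode("pcs node unstandby --all")  # after upgrade completes
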
def cluster_standby_mode() -> None:
    """
    Put the cluster into standby mode.

    Note: this function may be replaced by a Cluster Manager call.

    Exceptions:
        UpgradeError
    """
    Log.info("Set cluster to standby mode")
    standby_cmd = "pcs node standby --all --wait=600"
    try:
        SimpleCommand().run_cmd(standby_cmd)
    except Exception as err:
        raise UpgradeError("Cluster standby operation failed") from err

def is_resource_deleted(timeout) -> None:
    """
    Check whether the pre-disruptive-upgrade resource deletion succeeded.
    """
    base_wait = 5
    while timeout > 0:
        resources = _get_resource_list()
        Log.info(
            f"Waiting up to {timeout}s for resources {resources} to be deleted.")
        if len(resources) == 0:
            Log.info("All resources deleted successfully.")
            break
        time.sleep(base_wait)
        timeout -= base_wait
    resources = _get_resource_list()
    if len(resources) != 0:
        raise UpgradeError(
            f"Failed to delete resources. Remaining resources {resources} ...")

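# _get_resource_list() is used above but not defined in this module. A plausible
# sketch, assuming it extracts resource ids from the CIB the same way the
# CIB-based deletion below does; the real helper may query pcs differently.
def _get_resource_list_sketch() -> list:
    root = _get_cib_xml()
    return [e.attrib["id"] for e in root.findall(".//lrm_resource")
            if "id" in e.attrib]
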
def cluster_standby_mode() -> None:
    """
    Put the cluster into standby mode.

    Note: this function may be replaced by a Cluster Manager call.

    Exceptions:
        UpgradeError
    """
    Log.info("Set cluster to standby mode")
    Log.info("Please wait, standby can take 20 to 30 min at most.")
    standby_cmd = f"{PCS_CLUSTER_STANDBY} --wait=1800"
    try:
        SimpleCommand().run_cmd(standby_cmd)
    except Exception as err:
        raise UpgradeError("Cluster standby operation failed") from err

def _load_config(ha_source_conf: str = SOURCE_CONFIG_FILE,
                 ha_backup_conf: str = BACKUP_CONFIG_FILE) -> None:
    '''
    Load the new config at the proper location after the RPM upgrade
    as part of the post-upgrade process.
    '''
    dest_dir = CONFIG_DIR
    new_src_dir = SOURCE_CONFIG_PATH

    # Convert yaml to dictionary
    old_backup_conf_dict = _yaml_to_dict(ha_backup_conf)
    new_conf_dict = _yaml_to_dict(ha_source_conf)

    # Note: there are 3 scenarios for conf file upgrade:
    # 1. A new conf key-value pair can be introduced.
    # 2. An already present conf key can be updated with a new value.
    # 3. A conf key-value pair can be deleted.
    # Here we assume values are not updated: upgrade means a new key-value pair
    # is added. The 2nd scenario needs to be handled separately.
    # If a key is deleted after the upgrade and the conf is still loaded with
    # that key, functionality is not affected because that key is no longer in use.
    # dict.update() also overwrites values that changed in the new version; since
    # that scenario is not considered right now, update can be safely used.
    # So, only the first scenario is handled here.
    old_backup_conf_dict.update(new_conf_dict)

    # Finally, update the old config file with the new changes
    with open(ha_backup_conf, 'w') as outfile:
        yaml.dump(old_backup_conf_dict, outfile, default_flow_style=False)

    try:
        # Copy the updated backup conf file to the source
        copyfile(ha_backup_conf, ha_source_conf)
        # At last, copy the whole source directory, which has the updated
        # conf, to the desired location
        if os.path.exists(new_src_dir) and os.listdir(new_src_dir):
            # dirs_exist_ok requires Python 3.8+
            copytree(new_src_dir, dest_dir, dirs_exist_ok=True)
    except Exception as err:
        raise UpgradeError('Failed to load the new config after upgrading '
                           'the RPM. Please retry the upgrade process again') from err

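# Illustration of the dict.update() merge relied on in _load_config(); the keys
# below are hypothetical, not real HA configuration keys.
def _merge_example() -> dict:
    old = {"version": "1.0", "timeout": 30}      # from the backed-up conf
    new = {"version": "2.0", "retry_count": 3}   # from the freshly installed conf
    old.update(new)
    # old is now {"version": "2.0", "timeout": 30, "retry_count": 3}:
    # new keys are added and changed values are overwritten.
    return old
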
def delete_resources() -> None:
    """
    Delete pacemaker resources.

    Exceptions:
        UpgradeError
    """
    try:
        root = _get_cib_xml()
        resources = [e.attrib["id"] for e in root.findall(".//lrm_resource")
                     if "id" in e.attrib]
        Log.info(f"Going to delete the following resources: {resources}")
        for r in resources:
            Log.info(f"Deleting {r}")
            SimpleCommand().run_cmd(f"pcs resource delete {r}")
    except Exception as err:
        raise UpgradeError("Resource deletion failed") from err

def delete_resources() -> None:
    """
    Delete pacemaker resources.

    Exceptions:
        UpgradeError
    """
    try:
        resources = _get_resource_list()
        Log.info(f"Going to delete the following resources: {resources}")
        for r in resources:
            Log.info(f"Deleting resource {r}")
            SimpleCommand().run_cmd(
                PCS_DELETE_RESOURCE.replace("<resource>", r))
        SimpleCommand().run_cmd(PCS_CLEANUP)
        Log.info("Wait up to 2 min till all resources are deleted.")
        is_resource_deleted(120)
    except Exception as err:
        raise UpgradeError("Resource deletion failed") from err

def backup_configuration(src: str = CONFIG_DIR,
                         dst: str = BACKUP_DEST_DIR_CONF) -> None:
    """
    Backup HA configuration.

    Parameters:
        src: HA configs location
        dst: Directory with HA config backup

    Return:
        None

    Exceptions:
        UpgradeError
    """
    Log.info(f"Backup HA configuration from {src} to {dst}")
    try:
        if os.path.exists(dst):
            rmtree(dst)
        copytree(src, dst)
    except Exception as err:
        raise UpgradeError("Failed to create backup of HA config") from err

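# A hedged sketch of how the pieces above could be sequenced for the disruptive
# pre-upgrade stage. The function name and the ordering are assumptions; the
# project may drive these steps from a different entry point.
def pre_upgrade_sketch() -> None:
    check_cluster_health()    # refuse to start on an unhealthy cluster
    backup_configuration()    # snapshot the HA config
    backup_consul()           # snapshot the Consul KV store
    cluster_standby_mode()    # stop resources by putting nodes in standby
    delete_resources()        # remove pacemaker resources before the RPM swap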