def latest_checkpoint_location(self):
    """Returns checkpoint location for the cleanly shut down primary"""

    data = self.controldata()
    lsn = data.get('Latest checkpoint location')
    if data.get('Database cluster state') == 'shut down' and lsn:
        try:
            return str(parse_lsn(lsn))
        except (IndexError, ValueError) as e:
            logger.error('Exception when parsing lsn %s: %r', lsn, e)
def _check_timeline_and_lsn(self, leader):
    local_timeline, local_lsn = self._get_local_timeline_lsn()
    if local_timeline is None or local_lsn is None:
        return

    if isinstance(leader, Leader):
        if leader.member.data.get('role') != 'master':
            return
    # standby cluster
    elif not self.check_leader_is_not_in_recovery(
            **leader.conn_kwargs(self._postgresql.config.replication)):
        return

    history = need_rewind = None
    try:
        with self._postgresql.get_replication_connection_cursor(**leader.conn_kwargs()) as cur:
            cur.execute('IDENTIFY_SYSTEM')
            master_timeline = cur.fetchone()[1]
            logger.info('master_timeline=%s', master_timeline)
            if local_timeline > master_timeline:  # Not always supported by pg_rewind
                need_rewind = True
            elif master_timeline > 1:
                cur.execute('TIMELINE_HISTORY %s', (master_timeline,))
                history = bytes(cur.fetchone()[1]).decode('utf-8')
                logger.info('master: history=%s', history)
            else:  # local_timeline == master_timeline == 1
                need_rewind = False
    except Exception:
        return logger.exception('Exception when working with master via replication connection')

    if history is not None:
        # Find the switchpoint of the local timeline in the master history
        # and compare it with the local LSN to decide whether rewind is needed.
        for parent_timeline, switchpoint, _ in parse_history(history):
            if parent_timeline == local_timeline:
                try:
                    need_rewind = parse_lsn(local_lsn) >= switchpoint
                except (IndexError, ValueError):
                    logger.exception('Exception when parsing lsn')
                break
            elif parent_timeline > local_timeline:
                break

    self._state = REWIND_STATUS.NEED if need_rewind else REWIND_STATUS.NOT_NEED
def do_upgrade(self):
    from patroni.utils import polling_loop

    if not self.upgrade_required:
        logger.info('Current version=%s, desired version=%s. Upgrade is not required',
                    self.cluster_version, self.desired_version)
        return True

    if not (self.postgresql.is_running() and self.postgresql.is_leader()):
        return logger.error('PostgreSQL is not running or in recovery')

    cluster = self.dcs.get_cluster()
    if not self.sanity_checks(cluster):
        return False

    self._old_sysid = self.postgresql.sysid  # remember old sysid

    logger.info('Cluster %s is ready to be upgraded', self.postgresql.scope)
    if not self.postgresql.prepare_new_pgdata(self.desired_version):
        return logger.error('initdb failed')

    try:
        self.postgresql.drop_possibly_incompatible_extensions()
    except Exception:
        return logger.error('Failed to drop possibly incompatible extensions')

    if not self.postgresql.pg_upgrade(check=True):
        return logger.error('pg_upgrade --check failed, more details in the %s_upgrade',
                            self.postgresql.data_dir)

    try:
        self.postgresql.drop_possibly_incompatible_objects()
    except Exception:
        return logger.error('Failed to drop possibly incompatible objects')

    logger.info('Enabling maintenance mode')
    if not self.toggle_pause(True):
        return False

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    downtime_start = time.time()
    if not self.postgresql.stop(block_callbacks=True):
        return logger.error('Failed to stop the cluster before pg_upgrade')

    if self.replica_connections:
        from patroni.postgresql.misc import parse_lsn

        # Make sure we use the pg_controldata from the correct major version
        self.postgresql.set_bin_dir(self.cluster_version)
        controldata = self.postgresql.controldata()
        self.postgresql.set_bin_dir(self.desired_version)

        checkpoint_lsn = controldata.get('Latest checkpoint location')
        if controldata.get('Database cluster state') != 'shut down' or not checkpoint_lsn:
            return logger.error("Cluster wasn't shut down cleanly")

        checkpoint_lsn = parse_lsn(checkpoint_lsn)
        logger.info('Latest checkpoint location: %s', checkpoint_lsn)

        logger.info('Starting rsyncd')
        self.start_rsyncd()

        if not self.wait_for_replicas(checkpoint_lsn):
            return False

        if not (self.rsyncd.pid and self.rsyncd.poll() is None):
            return logger.error('Failed to start rsyncd')

    if self.replica_connections:
        logger.info('Executing CHECKPOINT on replicas %s', ','.join(self.replica_connections.keys()))
        pool = ThreadPool(len(self.replica_connections))
        # Do CHECKPOINT on replicas in parallel with pg_upgrade.
        # It will reduce the time for shutdown and so downtime.
        results = pool.map_async(self.checkpoint, self.replica_connections.items())
        pool.close()

    if not self.postgresql.pg_upgrade():
        return logger.error('Failed to upgrade cluster from %s to %s',
                            self.cluster_version, self.desired_version)

    self.postgresql.switch_pgdata()
    self.upgrade_complete = True

    logger.info('Updating configuration files')
    envdir = update_configs(self.desired_version)

    ret = True
    if self.replica_connections:
        # Check the status of CHECKPOINT on replicas and remove connections that failed.
        pool.join()
        if results.ready():
            for name, status in results.get():
                if not status:
                    ret = False
                    self.replica_connections.pop(name)

    member = cluster.get_member(self.postgresql.name)
    if self.replica_connections:
        primary_ip = member.conn_kwargs().get('host')
        rsync_start = time.time()
        try:
            if not self.rsync_replicas(primary_ip):
                ret = False
        except Exception as e:
            logger.error('rsync failed: %r', e)
            ret = False
        logger.info('Rsync took %s seconds', time.time() - rsync_start)

        self.stop_rsyncd()
        time.sleep(2)  # Give replicas a bit of time to switch PGDATA

    self.remove_initialize_key()
    kill_patroni()
    self.remove_initialize_key()

    time.sleep(1)
    for _ in polling_loop(10):
        if self.check_patroni_api(member):
            break
    else:
        logger.error('Patroni REST API on primary is not accessible after 10 seconds')

    logger.info('Starting the primary postgres up')
    for _ in polling_loop(10):
        try:
            result = self.request(member, 'post', 'restart', {})
            logger.info(' %s %s', result.status, result.data.decode('utf-8'))
            if result.status < 300:
                break
        except Exception as e:
            logger.error('POST /restart failed: %r', e)
    else:
        logger.error('Failed to start primary after upgrade')

    logger.info('Upgrade downtime: %s', time.time() - downtime_start)

    # The last attempt to fix initialize key race condition
    cluster = self.dcs.get_cluster()
    if cluster.initialize == self._old_sysid:
        self.dcs.cancel_initialization()

    try:
        self.postgresql.update_extensions()
    except Exception as e:
        logger.error('Failed to update extensions: %r', e)

    # start analyze early
    analyze_thread = Thread(target=self.analyze)
    analyze_thread.start()

    if self.replica_connections:
        self.wait_replicas_restart(cluster)

    self.resume_cluster()

    analyze_thread.join()
    self.reanalyze()

    logger.info('Total upgrade time (with analyze): %s', time.time() - downtime_start)
    self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
    self.postgresql.cleanup_old_pgdata()

    if envdir:
        self.start_backup(envdir)

    return ret