Example #1
    def latest_checkpoint_location(self):
        """Returns checkpoint location for the cleanly shut down primary"""

        data = self.controldata()
        lsn = data.get('Latest checkpoint location')
        if data.get('Database cluster state') == 'shut down' and lsn:
            try:
                return str(parse_lsn(lsn))
            except (IndexError, ValueError) as e:
                logger.error('Exception when parsing lsn %s: %r', lsn, e)
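As a quick illustration of what this example relies on: parse_lsn (imported from patroni.postgresql.misc) turns the textual LSN reported by pg_controldata, e.g. '0/3000060', into an integer that can be compared numerically. Below is a minimal sketch of such a conversion, assuming the usual "high 32 bits / low 32 bits" hex encoding of PostgreSQL LSNs; the function name is hypothetical, not the library's code.

    # Minimal sketch of an LSN parser; assumes the 'HI/LO' hex format used by
    # pg_controldata and the pg_lsn type. Illustrative only.
    def parse_lsn_sketch(lsn: str) -> int:
        hi, lo = lsn.split('/')                  # '0/3000060' -> ('0', '3000060')
        return (int(hi, 16) << 32) + int(lo, 16)

    print(parse_lsn_sketch('0/3000060'))         # 50331744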
Example #2
    def _check_timeline_and_lsn(self, leader):
        local_timeline, local_lsn = self._get_local_timeline_lsn()
        if local_timeline is None or local_lsn is None:
            return

        if isinstance(leader, Leader):
            if leader.member.data.get('role') != 'master':
                return
        # standby cluster
        elif not self.check_leader_is_not_in_recovery(
                **leader.conn_kwargs(self._postgresql.config.replication)):
            return

        history = need_rewind = None
        try:
            with self._postgresql.get_replication_connection_cursor(
                    **leader.conn_kwargs()) as cur:
                cur.execute('IDENTIFY_SYSTEM')
                master_timeline = cur.fetchone()[1]
                logger.info('master_timeline=%s', master_timeline)
                if local_timeline > master_timeline:  # Not always supported by pg_rewind
                    need_rewind = True
                elif master_timeline > 1:
                    cur.execute('TIMELINE_HISTORY %s', (master_timeline, ))
                    history = bytes(cur.fetchone()[1]).decode('utf-8')
                    logger.info('master: history=%s', history)
                else:  # local_timeline == master_timeline == 1
                    need_rewind = False
        except Exception:
            return logger.exception(
                'Exception when working with master via replication connection'
            )

        if history is not None:
            for parent_timeline, switchpoint, _ in parse_history(history):
                if parent_timeline == local_timeline:
                    try:
                        need_rewind = parse_lsn(local_lsn) >= switchpoint
                    except (IndexError, ValueError):
                        logger.exception('Exception when parsing lsn')
                    break
                elif parent_timeline > local_timeline:
                    break

        self._state = REWIND_STATUS.NEED if need_rewind else REWIND_STATUS.NOT_NEED
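The rewind decision above boils down to one comparison: find the entry in the primary's timeline history where it branched off the local timeline, and check whether the local LSN is at or beyond that switchpoint. Here is a hedged, self-contained sketch of that check; parse_history in the example is assumed to yield (parent_timeline, switchpoint, reason) tuples with the switchpoint already an integer, and the helper below is hypothetical, not Patroni's code.

    # Hypothetical helper mirroring the history scan above; illustrative only.
    def needs_rewind(local_timeline: int, local_lsn: int, history) -> bool:
        """history: iterable of (parent_timeline, switchpoint) pairs,
        ordered by parent_timeline, as in a TIMELINE_HISTORY file."""
        for parent_timeline, switchpoint in history:
            if parent_timeline == local_timeline:
                # The primary branched off our timeline at `switchpoint`;
                # having replayed past it means our history diverged.
                return local_lsn >= switchpoint
            if parent_timeline > local_timeline:
                break
        return False

    # Local timeline 2 replayed past the 2 -> 3 switchpoint, so a rewind is needed.
    print(needs_rewind(2, 0x3000060, [(1, 0x2000000), (2, 0x3000000)]))  # True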
Example #3
    def do_upgrade(self):
        from patroni.utils import polling_loop

        if not self.upgrade_required:
            logger.info(
                'Current version=%s, desired version=%s. Upgrade is not required',
                self.cluster_version, self.desired_version)
            return True

        if not (self.postgresql.is_running() and self.postgresql.is_leader()):
            return logger.error('PostgreSQL is not running or in recovery')

        cluster = self.dcs.get_cluster()

        if not self.sanity_checks(cluster):
            return False

        self._old_sysid = self.postgresql.sysid  # remember old sysid

        logger.info('Cluster %s is ready to be upgraded',
                    self.postgresql.scope)
        if not self.postgresql.prepare_new_pgdata(self.desired_version):
            return logger.error('initdb failed')

        try:
            self.postgresql.drop_possibly_incompatible_extensions()
        except Exception:
            return logger.error(
                'Failed to drop possibly incompatible extensions')

        if not self.postgresql.pg_upgrade(check=True):
            return logger.error(
                'pg_upgrade --check failed, more details in the %s_upgrade directory',
                self.postgresql.data_dir)

        try:
            self.postgresql.drop_possibly_incompatible_objects()
        except Exception:
            return logger.error('Failed to drop possibly incompatible objects')

        logger.info('Enabling maintenance mode')
        if not self.toggle_pause(True):
            return False

        logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
        downtime_start = time.time()
        if not self.postgresql.stop(block_callbacks=True):
            return logger.error('Failed to stop the cluster before pg_upgrade')

        if self.replica_connections:
            from patroni.postgresql.misc import parse_lsn

            # Make sure we use the pg_controldata from the correct major version
            self.postgresql.set_bin_dir(self.cluster_version)
            controldata = self.postgresql.controldata()
            self.postgresql.set_bin_dir(self.desired_version)

            checkpoint_lsn = controldata.get('Latest checkpoint location')
            if controldata.get('Database cluster state'
                               ) != 'shut down' or not checkpoint_lsn:
                return logger.error("Cluster wasn't shut down cleanly")

            checkpoint_lsn = parse_lsn(checkpoint_lsn)
            logger.info('Latest checkpoint location: %s', checkpoint_lsn)

            logger.info('Starting rsyncd')
            self.start_rsyncd()

            if not self.wait_for_replicas(checkpoint_lsn):
                return False

            if not (self.rsyncd.pid and self.rsyncd.poll() is None):
                return logger.error('Failed to start rsyncd')

        if self.replica_connections:
            logger.info('Executing CHECKPOINT on replicas %s',
                        ','.join(self.replica_connections.keys()))
            pool = ThreadPool(len(self.replica_connections))
            # Run CHECKPOINT on the replicas in parallel with pg_upgrade;
            # it shortens their later shutdown and therefore the overall downtime.
            results = pool.map_async(self.checkpoint,
                                     self.replica_connections.items())
            pool.close()

        if not self.postgresql.pg_upgrade():
            return logger.error('Failed to upgrade cluster from %s to %s',
                                self.cluster_version, self.desired_version)

        self.postgresql.switch_pgdata()
        self.upgrade_complete = True

        logger.info('Updating configuration files')
        envdir = update_configs(self.desired_version)

        ret = True
        if self.replica_connections:
            # Check the status of the replica CHECKPOINTs and drop connections that failed.
            pool.join()
            if results.ready():
                for name, status in results.get():
                    if not status:
                        ret = False
                        self.replica_connections.pop(name)

        member = cluster.get_member(self.postgresql.name)
        if self.replica_connections:
            primary_ip = member.conn_kwargs().get('host')
            rsync_start = time.time()
            try:
                if not self.rsync_replicas(primary_ip):
                    ret = False
            except Exception as e:
                logger.error('rsync failed: %r', e)
                ret = False
            logger.info('Rsync took %s seconds', time.time() - rsync_start)

            self.stop_rsyncd()
            time.sleep(2)  # Give replicas a bit of time to switch PGDATA

        self.remove_initialize_key()
        kill_patroni()
        self.remove_initialize_key()

        time.sleep(1)
        for _ in polling_loop(10):
            if self.check_patroni_api(member):
                break
        else:
            logger.error(
                'Patroni REST API on primary is not accessible after 10 seconds'
            )

        logger.info('Starting the primary postgres up')
        for _ in polling_loop(10):
            try:
                result = self.request(member, 'post', 'restart', {})
                logger.info('   %s %s', result.status,
                            result.data.decode('utf-8'))
                if result.status < 300:
                    break
            except Exception as e:
                logger.error('POST /restart failed: %r', e)
        else:
            logger.error('Failed to start primary after upgrade')

        logger.info('Upgrade downtime: %s', time.time() - downtime_start)

        # The last attempt to fix initialize key race condition
        cluster = self.dcs.get_cluster()
        if cluster.initialize == self._old_sysid:
            self.dcs.cancel_initialization()

        try:
            self.postgresql.update_extensions()
        except Exception as e:
            logger.error('Failed to update extensions: %r', e)

        # start analyze early
        analyze_thread = Thread(target=self.analyze)
        analyze_thread.start()

        if self.replica_connections:
            self.wait_replicas_restart(cluster)

        self.resume_cluster()

        analyze_thread.join()

        self.reanalyze()

        logger.info('Total upgrade time (with analyze): %s',
                    time.time() - downtime_start)
        self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
        self.postgresql.cleanup_old_pgdata()

        if envdir:
            self.start_backup(envdir)

        return ret
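In the upgrade flow above, the parsed 'Latest checkpoint location' of the cleanly shut down primary is the target the replicas must replay before rsync_replicas() copies the upgraded data directory over them. A hedged sketch of such a wait loop follows; the function, its SQL, and the timeout are illustrative assumptions for PostgreSQL 10+, not the script's wait_for_replicas implementation.

    import time
    from patroni.postgresql.misc import parse_lsn

    # Hypothetical wait loop: poll a replica until it has replayed the
    # primary's shutdown checkpoint LSN. Illustrative only.
    def wait_for_replay(cursor, checkpoint_lsn, timeout=60):
        """cursor: open cursor on the replica; checkpoint_lsn: integer
        produced by parse_lsn() from the 'Latest checkpoint location' value."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            cursor.execute('SELECT pg_last_wal_replay_lsn()')
            replayed = parse_lsn(cursor.fetchone()[0])  # pg_lsn arrives as text
            if replayed >= checkpoint_lsn:
                return True
            time.sleep(1)
        return False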