Example #1
    def wait_for_replicas(self, checkpoint_lsn):
        from patroni.utils import polling_loop

        logger.info('Waiting for replica nodes to catch up with primary')

        query = (
            "SELECT pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(),"
            " '0/0')::bigint").format(self.postgresql.wal_name,
                                      self.postgresql.lsn_name)

        status = {}

        for _ in polling_loop(60):
            synced = True
            for name, (_, cur) in self.replica_connections.items():
                prev = status.get(name)
                if prev and prev >= checkpoint_lsn:
                    continue

                cur.execute(query)
                lsn = cur.fetchone()[0]
                status[name] = lsn

                if lsn < checkpoint_lsn:
                    synced = False

            if synced:
                logger.info('All replicas are ready')
                return True

        for name in self.replica_connections.keys():
            lsn = status.get(name)
            if not lsn or lsn < checkpoint_lsn:
                lag = checkpoint_lsn - lsn if lsn else 'unknown'
                logger.error('Node %s did not catch up. Lag=%s', name, lag)
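The checkpoint_lsn argument above is a plain integer LSN. Example #18 below derives it from pg_controldata after a clean shutdown; here is a minimal sketch of such a call site, where the upgrade object and the exact wiring are assumptions for illustration rather than part of this example:

from patroni.postgresql.misc import parse_lsn


def wait_replicas_after_shutdown(upgrade):
    # Hypothetical helper mirroring Example #18: after a clean shutdown, take the
    # checkpoint location reported by pg_controldata, parse it into an integer LSN
    # and hand it to wait_for_replicas() above.
    controldata = upgrade.postgresql.controldata()
    checkpoint_lsn = controldata.get('Latest checkpoint location')
    if controldata.get('Database cluster state') != 'shut down' or not checkpoint_lsn:
        return False
    return upgrade.wait_for_replicas(parse_lsn(checkpoint_lsn))
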
Example #2
def main():
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    upgrade = PostgresqlUpgrade(config['postgresql'])

    bin_version = upgrade.get_binary_version()
    cluster_version = upgrade.get_cluster_version()

    if cluster_version == bin_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', cluster_version, bin_version)
    assert float(cluster_version) < float(bin_version)

    upgrade.config['pg_ctl_timeout'] = 3600 * 24 * 7

    logger.info('Trying to start the cluster with old postgres')
    if not upgrade.start_old_cluster(config['bootstrap'], cluster_version):
        raise Exception('Failed to start the cluster with old postgres')

    for _ in polling_loop(upgrade.config['pg_ctl_timeout'], 10):
        upgrade.reset_cluster_info_state()
        if upgrade.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')

    if not upgrade.run_bootstrap_post_init(config['bootstrap']):
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise Exception('Failed to run bootstrap.post_init')

    locale = upgrade.query('SHOW lc_collate').fetchone()[0]
    encoding = upgrade.query('SHOW server_encoding').fetchone()[0]
    initdb_config = [{'locale': locale}, {'encoding': encoding}]
    if upgrade.query("SELECT current_setting('data_checksums')::bool").fetchone()[0]:
        initdb_config.append('data-checksums')

    logger.info('Dropping objects from the cluster which could be incompatible')
    try:
        upgrade.drop_possibly_incompatible_objects()
    except Exception:
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not upgrade.stop(block_callbacks=True, checkpoint=False):
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    if not upgrade.do_upgrade(bin_version, {'initdb': initdb_config}):
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(cluster_version, bin_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not upgrade.start():
        raise Exception('Failed to start the cluster with new postgres')
    upgrade.analyze()
Example #3
def wait_until_pause_is_applied(dcs, paused, old_cluster):
    click.echo(
        "'{0}' request sent, waiting until it is recognized by all nodes".
        format(paused and 'pause' or 'resume'))
    old = {m.name: m.index for m in old_cluster.members if m.api_url}
    loop_wait = old_cluster.config.data.get('loop_wait', dcs.loop_wait)

    for _ in polling_loop(loop_wait + 1):
        cluster = dcs.get_cluster()
        if all(
                m.data.get('pause', False) == paused for m in cluster.members
                if m.name in old):
            break
    else:
        remaining = [
            m.name for m in cluster.members
            if m.data.get('pause', False) != paused and m.name in old
            and old[m.name] != m.index
        ]
        if remaining:
            return click.echo(
                "{0} members didn't recognized pause state after {1} seconds".
                format(', '.join(remaining), loop_wait))
    return click.echo('Success: cluster management is {0}'.format(
        paused and 'paused' or 'resumed'))
Example #4
def main():
    from pg_upgrade import PostgresqlUpgrade
    from patroni.config import Config
    from patroni.utils import polling_loop
    from spilo_commons import get_binary_version

    config = Config(sys.argv[1])
    upgrade = PostgresqlUpgrade(config)

    bin_version = get_binary_version(upgrade.pgcommand(''))
    cluster_version = upgrade.get_cluster_version()

    if cluster_version == bin_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', cluster_version,
                bin_version)
    assert float(cluster_version) < float(bin_version)

    logger.info('Trying to start the cluster with old postgres')
    if not upgrade.start_old_cluster(config['bootstrap'], cluster_version):
        raise Exception('Failed to start the cluster with old postgres')

    for _ in polling_loop(upgrade.config.get('pg_ctl_timeout'), 10):
        upgrade.reset_cluster_info_state()
        if upgrade.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')

    if not upgrade.bootstrap.call_post_bootstrap(config['bootstrap']):
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise Exception('Failed to run bootstrap.post_init')

    if not upgrade.prepare_new_pgdata(bin_version):
        raise Exception('initdb failed')

    try:
        upgrade.drop_possibly_incompatible_objects()
    except Exception:
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not upgrade.stop(block_callbacks=True, checkpoint=False):
        raise Exception('Failed to stop the cluster with old postgres')

    if not upgrade.do_upgrade():
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(
            cluster_version, bin_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not upgrade.start():
        raise Exception('Failed to start the cluster with new postgres')

    try:
        upgrade.update_extensions()
    except Exception as e:
        logger.error('Failed to update extensions: %r', e)

    upgrade.analyze()
Example #5
def wait_end_of_recovery(postgresql):
    from patroni.utils import polling_loop

    for _ in polling_loop(postgresql.config.get('pg_ctl_timeout'), 10):
        postgresql.reset_cluster_info_state()
        if postgresql.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')
Example #6
def main():
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    upgrade = PostgresqlUpgrade(config['postgresql'])

    bin_version = upgrade.get_binary_version()
    cluster_version = upgrade.get_cluster_version()

    if cluster_version == bin_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', cluster_version,
                bin_version)
    assert float(cluster_version) < float(bin_version)

    upgrade.set_bin_dir(cluster_version)
    upgrade.config['pg_ctl_timeout'] = 3600 * 24 * 7
    upgrade.config['callbacks'] = {}

    bootstrap_config = config['bootstrap']
    bootstrap_config[bootstrap_config['method']]['command'] = 'true'
    logger.info('Trying to start the cluster with old postgres')
    if not upgrade.bootstrap(bootstrap_config):
        raise Exception('Failed to start the cluster with old postgres')

    for _ in polling_loop(upgrade.config['pg_ctl_timeout'], 10):
        upgrade.reset_cluster_info_state()
        if upgrade.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')

    if not upgrade.run_bootstrap_post_init(bootstrap_config):
        raise Exception('Failed to run bootstrap.post_init')

    locale = upgrade.query('SHOW lc_collate').fetchone()[0]
    encoding = upgrade.query('SHOW server_encoding').fetchone()[0]
    initdb_config = [{'locale': locale}, {'encoding': encoding}]
    if upgrade.query('SHOW data_checksums').fetchone()[0]:
        initdb_config.append('data-checksums')

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not upgrade.stop(block_callbacks=True, checkpoint=False):
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    if not upgrade.do_upgrade(bin_version, {'initdb': initdb_config}):
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(
            cluster_version, bin_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not upgrade.start():
        raise Exception('Failed to start the cluster with new postgres')
    upgrade.analyze()
Example #7
    def remove_initialize_key(self):
        from patroni.utils import polling_loop

        for _ in polling_loop(10):
            cluster = self.dcs.get_cluster()
            if cluster.initialize is None:
                return True
            logger.info('Removing initialize key')
            if self.dcs.cancel_initialization():
                return True
        logger.error('Failed to remove initialize key')
Example #8
    def cancel(self):
        with self._lock:
            self._is_cancelled = True
            if self._process is None or not self._process.is_running():
                return
            self._process.terminate()

        for _ in polling_loop(10):
            with self._lock:
                if self._process is None or not self._process.is_running():
                    return

        self._kill_process()
Example #9
    def cancel(self):
        with self._lock:
            self._is_cancelled = True
            if self._process is None or self._process.returncode is not None:
                return
            self._process.terminate()

        for _ in polling_loop(10):
            with self._lock:
                if self._process is None or self._process.returncode is not None:
                    return

        with self._lock:
            if self._process is not None and self._process.returncode is None:
                self._process.kill()
Example #10
    def wait_replica_restart(self, member):
        from patroni.utils import polling_loop

        for _ in polling_loop(10):
            try:
                response = self.request(member, timeout=2, retries=0)
                if response.status == 200:
                    data = json.loads(response.data.decode('utf-8'))
                    database_system_identifier = data.get(
                        'database_system_identifier')
                    if database_system_identifier and database_system_identifier != self._old_sysid:
                        return member.name
            except Exception:
                pass
        logger.error('Patroni on replica %s was not restarted in 10 seconds',
                     member.name)
Example #11
def wait_until_pause_is_applied(dcs, paused, old_cluster):
    click.echo("'{0}' request sent, waiting until it is recognized by all nodes".format(paused and 'pause' or 'resume'))
    old = {m.name: m.index for m in old_cluster.members if m.api_url}
    loop_wait = old_cluster.config.data.get('loop_wait', dcs.loop_wait)

    for _ in polling_loop(loop_wait + 1):
        cluster = dcs.get_cluster()
        if all(m.data.get('pause', False) == paused for m in cluster.members if m.name in old):
            break
    else:
        remaining = [m.name for m in cluster.members if m.data.get('pause', False) != paused
                     and m.name in old and old[m.name] != m.index]
        if remaining:
            return click.echo("{0} members didn't recognized pause state after {1} seconds"
                              .format(', '.join(remaining), loop_wait))
    return click.echo('Success: cluster management is {0}'.format(paused and 'paused' or 'resumed'))
Example #12
    def cancel(self, kill=False):
        with self._lock:
            self._is_cancelled = True
            if self._process is None or not self._process.is_running():
                return

            logger.info('Terminating %s', self._process_cmd)
            self._process.terminate()

        for _ in polling_loop(10):
            with self._lock:
                if self._process is None or not self._process.is_running():
                    return
            if kill:
                break

        self._kill_process()
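The cancel() variants in Examples #8, #9 and #12 all follow the same terminate-poll-kill pattern. Below is a standalone sketch of that pattern written against a plain subprocess.Popen handle instead of the examples' internal _process attribute; the helper name and usage are assumptions for illustration:

import subprocess

from patroni.utils import polling_loop


def cancel_process(proc, timeout=10):
    # Sketch of the terminate-then-kill pattern from the cancel() examples above:
    # ask the process to terminate, poll for up to `timeout` seconds, then force-kill.
    if proc.poll() is not None:  # already exited
        return
    proc.terminate()
    for _ in polling_loop(timeout):
        if proc.poll() is not None:  # exited after SIGTERM
            return
    proc.kill()  # still running after the timeout -- escalate


if __name__ == '__main__':
    # Hypothetical usage: terminate a long-running child process.
    cancel_process(subprocess.Popen(['sleep', '60']))
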
Example #13
File: ha.py Project: vanife/patroni
    def while_not_sync_standby(self, func):
        """Runs specified action while trying to make sure that the node is not assigned synchronous standby status.

        Tags us as not allowed to be a sync standby as we are going to go away, if we currently are wait for
        leader to notice and pick an alternative one or if the leader changes or goes away we are also free.

        If the connection to DCS fails we run the action anyway, as this is only a hint.

        There is a small race window where this function runs between a master picking us the sync standby and
        publishing it to the DCS. As the window is rather tiny consequences are holding up commits for one cycle
        period we don't worry about it here."""

        if not self.is_synchronous_mode() or self.patroni.nosync:
            return func()

        with self._member_state_lock:
            self._disable_sync += 1
        try:
            if self.touch_member():
                # The master should notice the updated value during the next cycle. We will wait double that;
                # if the master hasn't noticed the value by then, not disabling sync replication is unlikely to matter.
                for _ in polling_loop(timeout=self.dcs.loop_wait * 2,
                                      interval=2):
                    try:
                        if not self.is_sync_standby(self.dcs.get_cluster()):
                            break
                    except DCSError:
                        logger.warning(
                            "Could not get cluster state, skipping synchronous standby disable"
                        )
                        break
                    logger.info(
                        "Waiting for master to release us from synchronous standby"
                    )
            else:
                logger.warning(
                    "Updating member state failed, skipping synchronous standby disable"
                )

            return func()
        finally:
            with self._member_state_lock:
                self._disable_sync -= 1
Example #14
    def wait_for_port_open(self, postmaster, timeout):
        """Waits until PostgreSQL opens ports."""
        for _ in polling_loop(timeout):
            if self.cancellable.is_cancelled:
                return False

            if not postmaster.is_running():
                logger.error('postmaster is not running')
                self.set_state('start failed')
                return False

            isready = self.pg_isready()
            if isready != STATE_NO_RESPONSE:
                if isready not in [STATE_REJECT, STATE_RUNNING]:
                    logger.warning("Can't determine PostgreSQL startup status, assuming running")
                return True

        logger.warning("Timed out waiting for PostgreSQL to start")
        return False
Example #15
    def toggle_pause(self, paused):
        from patroni.utils import polling_loop

        cluster = self.dcs.get_cluster()
        config = cluster.config.data.copy()
        if cluster.is_paused() == paused:
            return logger.error('Cluster is %spaused, can not continue',
                                ('' if paused else 'not '))

        config['pause'] = paused
        if not self.dcs.set_config_value(
                json.dumps(config, separators=(',', ':')),
                cluster.config.index):
            return logger.error('Failed to pause cluster, can not continue')

        self.paused = paused

        old = {m.name: m.index for m in cluster.members if m.api_url}
        ttl = cluster.config.data.get('ttl', self.dcs.ttl)
        for _ in polling_loop(ttl + 1):
            cluster = self.dcs.get_cluster()
            if all(
                    m.data.get('pause', False) == paused
                    for m in cluster.members if m.name in old):
                logger.info('Maintenance mode %s',
                            ('enabled' if paused else 'disabled'))
                return True

        remaining = [
            m.name for m in cluster.members
            if m.data.get('pause', False) != paused and m.name in old
            and old[m.name] != m.index
        ]
        if remaining:
            return logger.error(
                "%s members didn't recognized pause state after %s seconds",
                remaining, ttl)
Example #16
    def while_not_sync_standby(self, func):
        """Runs specified action while trying to make sure that the node is not assigned synchronous standby status.

        Tags us as not allowed to be a sync standby as we are going to go away, if we currently are wait for
        leader to notice and pick an alternative one or if the leader changes or goes away we are also free.

        If the connection to DCS fails we run the action anyway, as this is only a hint.

        There is a small race window where this function runs between a master picking us the sync standby and
        publishing it to the DCS. As the window is rather tiny consequences are holding up commits for one cycle
        period we don't worry about it here."""

        if not self.is_synchronous_mode() or self.patroni.nosync:
            return func()

        with self._member_state_lock:
            self._disable_sync += 1
        try:
            if self.touch_member():
                # The master should notice the updated value during the next cycle. We will wait double that;
                # if the master hasn't noticed the value by then, not disabling sync replication is unlikely to matter.
                for _ in polling_loop(timeout=self.dcs.loop_wait * 2, interval=2):
                    try:
                        if not self.is_sync_standby(self.dcs.get_cluster()):
                            break
                    except DCSError:
                        logger.warning("Could not get cluster state, skipping synchronous standby disable")
                        break
                    logger.info("Waiting for master to release us from synchronous standby")
            else:
                logger.warning("Updating member state failed, skipping synchronous standby disable")

            return func()
        finally:
            with self._member_state_lock:
                self._disable_sync -= 1
Example #17
def rsync_replica(config, desired_version, primary_ip, pid):
    from pg_upgrade import PostgresqlUpgrade
    from patroni.utils import polling_loop

    me = psutil.Process()

    # check that we are a child of the postgres backend
    if me.parent().pid != pid and me.parent().parent().pid != pid:
        return 1

    backend = psutil.Process(pid)
    if 'postgres' not in backend.name():
        return 1

    postgresql = PostgresqlUpgrade(config)

    if postgresql.get_cluster_version() == desired_version:
        return 0

    if os.fork():
        return 0

    # Wait until the remote side closes the connection and the backend process exits
    for _ in polling_loop(10):
        if not backend.is_running():
            break
    else:
        logger.warning('Backend did not exit after 10 seconds')

    sysid = postgresql.sysid  # remember old sysid

    if not postgresql.stop(block_callbacks=True):
        logger.error('Failed to stop the cluster before rsync')
        return 1

    postgresql.switch_pgdata()

    update_configs(desired_version)

    env = os.environ.copy()
    env['RSYNC_PASSWORD'] = postgresql.config.replication['password']
    rsync_args = [
        'rsync', '--archive', '--delete', '--hard-links', '--size-only',
        '--omit-dir-times', '--no-inc-recursive', '--include=/data/***',
        '--include=/data_old/***', '--exclude=/data/pg_xlog/*',
        '--exclude=/data_old/pg_xlog/*', '--exclude=/data/pg_wal/*',
        '--exclude=/data_old/pg_wal/*', '--exclude=*',
        'rsync://{0}@{1}:{2}/pgroot'.format(postgresql.name, primary_ip, RSYNC_PORT),
        os.path.dirname(postgresql.data_dir)
    ]
    if subprocess.call(rsync_args, env=env) != 0:
        logger.error('Failed to rsync from %s', primary_ip)
        postgresql.switch_back_pgdata()
        # XXX: rollback configs?
        return 1

    conn_kwargs = {
        k: v
        for k, v in postgresql.config.replication.items() if v is not None
    }
    if 'username' in conn_kwargs:
        conn_kwargs['user'] = conn_kwargs.pop('username')

    # If we restart Patroni right now there is a chance that it will exit due to the sysid mismatch.
    # Due to the cleaned environment we can't always use the DCS on replicas in this script; therefore
    # a good indicator that the initialize key has been deleted/updated is a running primary after the upgrade.
    for _ in polling_loop(300):
        try:
            with postgresql.get_replication_connection_cursor(
                    primary_ip, **conn_kwargs) as cur:
                cur.execute('IDENTIFY_SYSTEM')
                if cur.fetchone()[0] != sysid:
                    break
        except Exception:
            pass

    # If the cluster was unpaused before we restarted Patroni, it might have created
    # the recovery.conf file and tried (and failed) to start the cluster up using the wrong binaries.
    # In case of an upgrade to 12+, the presence of PGDATA/recovery.conf will not allow postgres to start.
    # We remove the recovery.conf and restart Patroni in order to make sure it is using the correct config.
    try:
        postgresql.config.remove_recovery_conf()
    except Exception:
        pass
    kill_patroni()
    try:
        postgresql.config.remove_recovery_conf()
    except Exception:
        pass

    return postgresql.cleanup_old_pgdata()
Example #18
    def do_upgrade(self):
        from patroni.utils import polling_loop

        if not self.upgrade_required:
            logger.info(
                'Current version=%s, desired version=%s. Upgrade is not required',
                self.cluster_version, self.desired_version)
            return True

        if not (self.postgresql.is_running() and self.postgresql.is_leader()):
            return logger.error('PostgreSQL is not running or in recovery')

        cluster = self.dcs.get_cluster()

        if not self.sanity_checks(cluster):
            return False

        self._old_sysid = self.postgresql.sysid  # remember old sysid

        logger.info('Cluster %s is ready to be upgraded',
                    self.postgresql.scope)
        if not self.postgresql.prepare_new_pgdata(self.desired_version):
            return logger.error('initdb failed')

        try:
            self.postgresql.drop_possibly_incompatible_extensions()
        except Exception:
            return logger.error(
                'Failed to drop possibly incompatible extensions')

        if not self.postgresql.pg_upgrade(check=True):
            return logger.error(
                'pg_upgrade --check failed, more details in the %s_upgrade',
                self.postgresql.data_dir)

        try:
            self.postgresql.drop_possibly_incompatible_objects()
        except Exception:
            return logger.error('Failed to drop possibly incompatible objects')

        logger.info('Enabling maintenance mode')
        if not self.toggle_pause(True):
            return False

        logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
        downtime_start = time.time()
        if not self.postgresql.stop(block_callbacks=True):
            return logger.error('Failed to stop the cluster before pg_upgrade')

        if self.replica_connections:
            from patroni.postgresql.misc import parse_lsn

            # Make sure we use the pg_controldata from the correct major version
            self.postgresql.set_bin_dir(self.cluster_version)
            controldata = self.postgresql.controldata()
            self.postgresql.set_bin_dir(self.desired_version)

            checkpoint_lsn = controldata.get('Latest checkpoint location')
            if controldata.get('Database cluster state'
                               ) != 'shut down' or not checkpoint_lsn:
                return logger.error("Cluster wasn't shut down cleanly")

            checkpoint_lsn = parse_lsn(checkpoint_lsn)
            logger.info('Latest checkpoint location: %s', checkpoint_lsn)

            logger.info('Starting rsyncd')
            self.start_rsyncd()

            if not self.wait_for_replicas(checkpoint_lsn):
                return False

            if not (self.rsyncd.pid and self.rsyncd.poll() is None):
                return logger.error('Failed to start rsyncd')

        if self.replica_connections:
            logger.info('Executing CHECKPOINT on replicas %s',
                        ','.join(self.replica_connections.keys()))
            pool = ThreadPool(len(self.replica_connections))
            # Do CHECKPOINT on replicas in parallel with pg_upgrade.
            # It reduces the shutdown time and therefore the downtime.
            results = pool.map_async(self.checkpoint,
                                     self.replica_connections.items())
            pool.close()

        if not self.postgresql.pg_upgrade():
            return logger.error('Failed to upgrade cluster from %s to %s',
                                self.cluster_version, self.desired_version)

        self.postgresql.switch_pgdata()
        self.upgrade_complete = True

        logger.info('Updating configuration files')
        envdir = update_configs(self.desired_version)

        ret = True
        if self.replica_connections:
            # Check the status of the replicas' CHECKPOINT and remove the connections that failed.
            pool.join()
            if results.ready():
                for name, status in results.get():
                    if not status:
                        ret = False
                        self.replica_connections.pop(name)

        member = cluster.get_member(self.postgresql.name)
        if self.replica_connections:
            primary_ip = member.conn_kwargs().get('host')
            rsync_start = time.time()
            try:
                if not self.rsync_replicas(primary_ip):
                    ret = False
            except Exception as e:
                logger.error('rsync failed: %r', e)
                ret = False
            logger.info('Rsync took %s seconds', time.time() - rsync_start)

            self.stop_rsyncd()
            time.sleep(2)  # Give replicas a bit of time to switch PGDATA

        self.remove_initialize_key()
        kill_patroni()
        self.remove_initialize_key()

        time.sleep(1)
        for _ in polling_loop(10):
            if self.check_patroni_api(member):
                break
        else:
            logger.error(
                'Patroni REST API on primary is not accessible after 10 seconds'
            )

        logger.info('Starting the primary postgres up')
        for _ in polling_loop(10):
            try:
                result = self.request(member, 'post', 'restart', {})
                logger.info('   %s %s', result.status,
                            result.data.decode('utf-8'))
                if result.status < 300:
                    break
            except Exception as e:
                logger.error('POST /restart failed: %r', e)
        else:
            logger.error('Failed to start primary after upgrade')

        logger.info('Upgrade downtime: %s', time.time() - downtime_start)

        # The last attempt to fix initialize key race condition
        cluster = self.dcs.get_cluster()
        if cluster.initialize == self._old_sysid:
            self.dcs.cancel_initialization()

        try:
            self.postgresql.update_extensions()
        except Exception as e:
            logger.error('Failed to update extensions: %r', e)

        # start analyze early
        analyze_thread = Thread(target=self.analyze)
        analyze_thread.start()

        if self.replica_connections:
            self.wait_replicas_restart(cluster)

        self.resume_cluster()

        analyze_thread.join()

        self.reanalyze()

        logger.info('Total upgrade time (with analyze): %s',
                    time.time() - downtime_start)
        self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
        self.postgresql.cleanup_old_pgdata()

        if envdir:
            self.start_backup(envdir)

        return ret
Example #19
    def rsync_replicas(self, primary_ip):
        from patroni.utils import polling_loop

        logger.info('Notifying replicas %s to start rsync',
                    ','.join(self.replica_connections.keys()))
        ret = True
        status = {}
        for name, (ip, cur) in self.replica_connections.items():
            try:
                cur.execute("SELECT pg_catalog.pg_backend_pid()")
                pid = cur.fetchone()[0]
                # We use the COPY TO PROGRAM "hack" to start the rsync on replicas.
                # There are a few important points:
                # 1. The script is started as a child process of the postgres backend, which
                #    is running with a clean environment. I.e., the script will not see the
                #    values of PGVERSION, SPILO_CONFIGURATION, KUBERNETES_SERVICE_HOST.
                # 2. Since access to the DCS might not be possible, we pass the primary_ip explicitly.
                # 3. The desired_version is passed explicitly to guarantee a 100% match with the master.
                # 4. To protect against an accidental "rsync" we pass the pid of the postgres backend.
                #    The script will check that it is a child of this very specific postgres process.
                cur.execute(
                    "COPY (SELECT) TO PROGRAM 'nohup {0} /scripts/inplace_upgrade.py {1} {2} {3}'"
                    .format(sys.executable, self.desired_version, primary_ip,
                            pid))
                conn = cur.connection
                cur.close()
                conn.close()
            except Exception as e:
                logger.error('COPY TO PROGRAM on %s failed: %r', name, e)
                status[name] = False
                ret = False

        for name in status.keys():
            self.replica_connections.pop(name)

        logger.info('Waiting for replicas rsync to complete')
        status.clear()
        for _ in polling_loop(300):
            synced = True
            for name in self.replica_connections.keys():
                feedback = os.path.join(self.rsyncd_feedback_dir, name)
                if name not in status and os.path.exists(feedback):
                    with open(feedback) as f:
                        status[name] = f.read().strip()

                if name not in status:
                    synced = False
            if synced:
                break

        for name in self.replica_connections.keys():
            result = status.get(name)
            if result is None:
                logger.error(
                    'Did not receive rsync feedback from %s after 300 seconds',
                    name)
                ret = False
            elif not result.startswith('0'):
                logger.error('Rsync on %s finished with code %s', name, result)
                ret = False
        return ret
Example #20
def main():
    from patroni.config import Config
    from patroni.utils import polling_loop
    from pg_upgrade import PostgresqlUpgrade

    config = Config()
    config['postgresql'].update({
        'callbacks': {},
        'pg_ctl_timeout': 3600 * 24 * 7
    })
    upgrade = PostgresqlUpgrade(config['postgresql'])

    bin_version = upgrade.get_binary_version()
    cluster_version = upgrade.get_cluster_version()

    if cluster_version == bin_version:
        return 0

    logger.info('Cluster version: %s, bin version: %s', cluster_version,
                bin_version)
    assert float(cluster_version) < float(bin_version)

    logger.info('Trying to start the cluster with old postgres')
    if not upgrade.start_old_cluster(config['bootstrap'], cluster_version):
        raise Exception('Failed to start the cluster with old postgres')

    for _ in polling_loop(upgrade.config.get('pg_ctl_timeout'), 10):
        upgrade.reset_cluster_info_state()
        if upgrade.is_leader():
            break
        logger.info('waiting for end of recovery of the old cluster')

    if not upgrade.bootstrap.call_post_bootstrap(config['bootstrap']):
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise Exception('Failed to run bootstrap.post_init')

    locale = upgrade.query('SHOW lc_collate').fetchone()[0]
    encoding = upgrade.query('SHOW server_encoding').fetchone()[0]
    initdb_config = [{'locale': locale}, {'encoding': encoding}]
    if upgrade.query(
            "SELECT current_setting('data_checksums')::bool").fetchone()[0]:
        initdb_config.append('data-checksums')

    logger.info(
        'Dropping objects from the cluster which could be incompatible')
    try:
        upgrade.drop_possibly_incompatible_objects()
    except Exception:
        upgrade.stop(block_callbacks=True, checkpoint=False)
        raise

    logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
    if not upgrade.stop(block_callbacks=True, checkpoint=False):
        raise Exception('Failed to stop the cluster with old postgres')

    logger.info('initdb config: %s', initdb_config)

    logger.info('Executing pg_upgrade')
    if not upgrade.do_upgrade(bin_version, initdb_config):
        raise Exception('Failed to upgrade cluster from {0} to {1}'.format(
            cluster_version, bin_version))

    logger.info('Starting the cluster with new postgres after upgrade')
    if not upgrade.start():
        raise Exception('Failed to start the cluster with new postgres')
    upgrade.analyze()
Example #21
    def do_upgrade(self):
        from patroni.utils import polling_loop

        if not self.upgrade_required:
            logger.info(
                'Current version=%s, desired version=%s. Upgrade is not required',
                self.cluster_version, self.desired_version)
            return True

        if not (self.postgresql.is_running() and self.postgresql.is_leader()):
            return logger.error('PostgreSQL is not running or in recovery')

        cluster = self.dcs.get_cluster()

        if not self.sanity_checks(cluster):
            return False

        self._old_sysid = self.postgresql.sysid  # remember old sysid

        logger.info('Cluster %s is ready to be upgraded',
                    self.postgresql.scope)
        if not self.postgresql.prepare_new_pgdata(self.desired_version):
            return logger.error('initdb failed')

        if not self.postgresql.pg_upgrade(check=True):
            return logger.error(
                'pg_upgrade --check failed, more details in the %s_upgrade',
                self.postgresql.data_dir)

        try:
            self.postgresql.drop_possibly_incompatible_objects()
        except Exception:
            return logger.error('Failed to drop possibly incompatible objects')

        logger.info('Enabling maintenance mode')
        if not self.toggle_pause(True):
            return False

        logger.info('Doing a clean shutdown of the cluster before pg_upgrade')
        downtime_start = time.time()
        if not self.postgresql.stop(block_callbacks=True):
            return logger.error('Failed to stop the cluster before pg_upgrade')

        if self.replica_connections:
            checkpoint_lsn = int(self.postgresql.latest_checkpoint_location())
            logger.info('Latest checkpoint location: %s', checkpoint_lsn)

            logger.info('Starting rsyncd')
            self.start_rsyncd()

            if not self.wait_for_replicas(checkpoint_lsn):
                return False

            if not (self.rsyncd.pid and self.rsyncd.poll() is None):
                return logger.error('Failed to start rsyncd')

        if not self.postgresql.pg_upgrade():
            return logger.error('Failed to upgrade cluster from %s to %s',
                                self.cluster_version, self.desired_version)

        self.postgresql.switch_pgdata()
        self.upgrade_complete = True

        logger.info('Updating configuration files')
        envdir = update_configs(self.desired_version)

        ret = True
        member = cluster.get_member(self.postgresql.name)
        if self.replica_connections:
            primary_ip = member.conn_kwargs().get('host')
            rsync_start = time.time()
            try:
                ret = self.rsync_replicas(primary_ip)
            except Exception as e:
                logger.error('rsync failed: %r', e)
                ret = False
            logger.info('Rsync took %s seconds', time.time() - rsync_start)

            self.stop_rsyncd()
            time.sleep(2)  # Give replicas a bit of time to switch PGDATA

        self.remove_initialize_key()
        kill_patroni()
        self.remove_initialize_key()

        time.sleep(1)
        for _ in polling_loop(10):
            if self.check_patroni_api(member):
                break
        else:
            logger.error(
                'Patroni REST API on primary is not accessible after 10 seconds'
            )

        logger.info('Starting the primary postgres up')
        for _ in polling_loop(10):
            try:
                result = self.request(member, 'post', 'restart', {})
                logger.info('   %s %s', result.status,
                            result.data.decode('utf-8'))
                if result.status < 300:
                    break
            except Exception as e:
                logger.error('POST /restart failed: %r', e)
        else:
            logger.error('Failed to start primary after upgrade')

        logger.info('Upgrade downtime: %s', time.time() - downtime_start)

        try:
            self.postgresql.update_extensions()
        except Exception as e:
            logger.error('Failed to update extensions: %r', e)

        # start analyze early
        analyze_thread = Thread(target=self.analyze)
        analyze_thread.start()

        self.wait_replicas_restart(cluster)

        self.resume_cluster()

        analyze_thread.join()

        self.reanalyze()

        logger.info('Total upgrade time (with analyze): %s',
                    time.time() - downtime_start)
        self.postgresql.bootstrap.call_post_bootstrap(self.config['bootstrap'])
        self.postgresql.cleanup_old_pgdata()

        if envdir:
            self.start_backup(envdir)

        return ret
Example #22
    def _wait_promote(self, wait_seconds):
        for _ in polling_loop(wait_seconds):
            data = self.controldata()
            if data.get('Database cluster state') == 'in production':
                return True
Example #23
    def test_polling_loop(self):
        self.assertEqual(list(polling_loop(0.001, interval=0.001)), [0])
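All of the examples above rely on patroni.utils.polling_loop behaving as a generator that yields an incrementing counter, sleeps for `interval` seconds between yields, and stops once `timeout` seconds have elapsed; the test above expects exactly one yield for a near-zero timeout. The following is a minimal sketch consistent with that observed behaviour, not the verbatim Patroni implementation:

import time


def polling_loop(timeout, interval=1):
    # Yield 0, 1, 2, ... roughly every `interval` seconds until `timeout` seconds
    # have passed since the first iteration.
    end_time = time.time() + timeout
    iteration = 0
    while time.time() < end_time:
        yield iteration
        iteration += 1
        time.sleep(interval)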