Code Example #1
File: ha.py  Project: alkersan/patroni
    def should_run_scheduled_action(self, action_name, scheduled_at, cleanup_fn):
        if scheduled_at:
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the scheduled action
            # Additionally, if the scheduled action cannot be executed altogether, i.e. there is an error
            # or the action is in the past - we take care of cleaning it up.
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info('Awaiting %s at %s (in %.0f seconds)',
                                action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < - int(self.dcs.loop_wait * 1.5):
                    logger.warning('Found a stale %s value, cleaning up: %s',
                                   action_name, scheduled_at.isoformat())
                    cleanup_fn()
                    self.dcs.manual_failover('', '', index=self.cluster.failover.index)
                    return False

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled {0} at %s'.format(action_name), scheduled_at.isoformat())
                return True
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s', scheduled_at)
                cleanup_fn()
        return False
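The comment block in this example spells out three outcomes depending on how far scheduled_at is from the current time. Below is a minimal standalone sketch of that timing decision, assuming a hypothetical loop_wait of 10 seconds (the real method reads self.dcs.loop_wait); it is an illustration, not Patroni code.

import datetime
import pytz

LOOP_WAIT = 10  # assumed value; should_run_scheduled_action reads self.dcs.loop_wait

def classify(scheduled_at, now=None):
    """Return 'wait', 'stale' or 'run' for a timezone-aware scheduled_at."""
    now = now or datetime.datetime.now(pytz.utc)
    delta = (scheduled_at - now).total_seconds()
    if delta > LOOP_WAIT:
        return 'wait'   # far future: log and return False
    if delta < -int(LOOP_WAIT * 1.5):
        return 'stale'  # stale value: clean up the key and return False
    return 'run'        # close to now: sleep(max(delta, 0)) and return True

now = datetime.datetime.now(pytz.utc)
print(classify(now + datetime.timedelta(seconds=60), now))  # 'wait'
print(classify(now - datetime.timedelta(seconds=60), now))  # 'stale'
print(classify(now + datetime.timedelta(seconds=2), now))   # 'run'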
Code Example #2
File: __init__.py  Project: jinty/patroni
    def initialize(self):
        # wait for etcd to be available
        while not self.touch_member():
            logger.info('waiting on DCS')
            sleep(5)

        # is data directory empty?
        if self.postgresql.data_directory_empty():
            while True:
                try:
                    cluster = self.ha.dcs.get_cluster()
                    if not cluster.is_unlocked():  # the leader already exists
                        if not cluster.initialize:
                            self.ha.dcs.initialize()
                        self.postgresql.bootstrap(cluster.leader)
                        break
                    # racing to initialize
                    elif not cluster.initialize and self.ha.dcs.initialize():
                        try:
                            self.postgresql.bootstrap()
                        except:
                            # bail out and clean the initialize flag.
                            self.cleanup_on_failed_initialization()
                            raise
                        self.ha.dcs.take_leader()
                        break
                except DCSError:
                    logger.info('waiting on DCS')
                sleep(5)
        elif self.postgresql.is_running():
            self.postgresql.load_replication_slots()
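The loop above races other nodes to set the initialize flag in the DCS: the node whose self.ha.dcs.initialize() call succeeds bootstraps a fresh cluster, while everyone else bootstraps from the leader. A minimal sketch of that create-if-absent decision, with a plain dict standing in for the DCS (an illustration, not Patroni's API):

dcs_store = {}  # stands in for the DCS

def try_initialize(node_name):
    """Return True only for the first caller, mimicking an atomic create."""
    if 'initialize' in dcs_store:
        return False
    dcs_store['initialize'] = node_name
    return True

print(try_initialize('node1'))  # True  -> bootstrap a new cluster
print(try_initialize('node2'))  # False -> bootstrap from the existing leader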
Code Example #3
File: ha.py  Project: ender74/patroni
    def process_unhealthy_cluster(self):
        """Cluster has no leader key"""

        if self.is_healthiest_node():
            if self.acquire_lock():
                failover = self.cluster.failover
                if failover:
                    if self.is_paused() and failover.leader and failover.candidate:
                        logger.info('Updating failover key after acquiring leader lock...')
                        self.dcs.manual_failover('', failover.candidate, failover.scheduled_at, failover.index)
                    else:
                        logger.info('Cleaning up failover key after acquiring leader lock...')
                        self.dcs.manual_failover('', '')
                self.load_cluster_from_dcs()
                return self.enforce_master_role('acquired session lock as a leader',
                                                'promoted self to leader by acquiring session lock')
            else:
                return self.follow('demoted self after trying and failing to obtain lock',
                                   'following new leader after trying and failing to obtain lock')
        else:
            # when we are doing a manual failover there is no guarantee that the new leader is ahead of any other node
            # a node tagged as nofailover can also be ahead of the new leader, but it is always excluded from elections
            need_rewind = bool(self.cluster.failover) or self.patroni.nofailover
            if need_rewind:
                sleep(2)  # Give somebody time to take the leader lock

            if self.patroni.nofailover:
                return self.follow('demoting self because I am not allowed to become master',
                                   'following a different leader because I am not allowed to promote',
                                   need_rewind=need_rewind)
            return self.follow('demoting self because I am not the healthiest node',
                               'following a different leader because I am not the healthiest node',
                               need_rewind=need_rewind)
Code Example #4
File: consul.py  Project: zenitraM/patroni
 def create_session(self):
     while not self._session:
         try:
             self.refresh_session()
         except ConsulError:
             logger.info('waiting on consul')
             sleep(5)
Code Example #5
    def should_run_scheduled_action(self, action_name, scheduled_at,
                                    cleanup_fn):
        if scheduled_at and not self.is_paused():
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the scheduled action
            # Additionally, if the scheduled action cannot be executed altogether, i.e. there is an error
            # or the action is in the past - we take care of cleaning it up.
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info('Awaiting %s at %s (in %.0f seconds)',
                                action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < -int(self.dcs.loop_wait * 1.5):
                    logger.warning('Found a stale %s value, cleaning up: %s',
                                   action_name, scheduled_at.isoformat())
                    cleanup_fn()
                    return False

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled {0} at %s'.format(action_name),
                            scheduled_at.isoformat())
                return True
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s',
                               scheduled_at)
                cleanup_fn()
        return False
Code Example #6
File: ha.py  Project: syaroslavtsev/patroni
    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover

        if failover.scheduled_at:
            # If the failover is in the far future, we shouldn't do anything and just return.
            # If the failover is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the failover
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (failover.scheduled_at - now).total_seconds()

                if delta > self.patroni.nap_time:
                    logger.info('Awaiting failover at %s (in %.0f seconds)',
                                failover.scheduled_at.isoformat(), delta)
                    return
                elif delta < -int(self.patroni.nap_time * 1.5):
                    logger.warning(
                        'Found a stale failover value, cleaning up: %s',
                        failover.scheduled_at)
                    self.dcs.manual_failover('',
                                             '',
                                             index=self.cluster.failover.index)
                    return

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled failover at {}'.format(
                    failover.scheduled_at.isoformat()))
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s',
                               failover.scheduled_at)

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                members = [
                    m for m in self.cluster.members
                    if not failover.candidate or m.name == failover.candidate
                ]
                if self.is_failover_possible(
                        members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    self._async_executor.run_async(self.demote)
                    return 'manual failover: demoting myself'
                else:
                    logger.warning(
                        'manual failover: no healthy members found, failover is not possible'
                    )
            else:
                logger.warning(
                    'manual failover: I am already the leader, no need to failover'
                )
        else:
            logger.warning(
                'manual failover: leader name does not match: %s != %s',
                self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', index=self.cluster.failover.index)
Code Example #7
File: etcd.py  Project: alkersan/patroni
 def get_etcd_client(config):
     client = None
     while not client:
         try:
             client = Client(config)
         except etcd.EtcdException:
             logger.info('waiting on etcd')
             sleep(5)
     return client
Code Example #8
File: dcs.py  Project: sean-/patroni
    def watch(self, timeout):
        """If the current node is a master it should just sleep.
        Any other node should watch for changes of leader key with a given timeout

        :param timeout: timeout in seconds
        :returns: `!True` if you would like to reschedule the next run of ha cycle"""

        sleep(timeout)
        return False
Code Example #9
 def get_etcd_client(config):
     client = None
     while not client:
         try:
             client = Client(config)
         except etcd.EtcdException:
             logger.info('waiting on etcd')
             sleep(5)
     return client
Code Example #10
File: ha.py  Project: vanife/patroni
    def process_sync_replication(self):
        """Process synchronous standby beahvior.

        Synchronous standbys are registered in two places postgresql.conf and DCS. The order of updating them must
        be right. The invariant that should be kept is that if a node is master and sync_standby is set in DCS,
        then that node must have synchronous_standby set to that value. Or more simple, first set in postgresql.conf
        and then in DCS. When removing, first remove in DCS, then in postgresql.conf. This is so we only consider
        promoting standbys that were guaranteed to be replicating synchronously.
        """
        if self.is_synchronous_mode():
            current = self.cluster.sync.leader and self.cluster.sync.sync_standby
            picked, allow_promote = self.state_handler.pick_synchronous_standby(
                self.cluster)
            if picked != current:
                # We need to revoke privilege from current before replacing it in the config
                if current:
                    logger.info("Removing synchronous privilege from %s",
                                current)
                    if not self.dcs.write_sync_state(
                            self.state_handler.name,
                            None,
                            index=self.cluster.sync.index):
                        logger.info(
                            'Synchronous replication key updated by someone else.'
                        )
                        return
                logger.info("Assigning synchronous standby status to %s",
                            picked)
                self.state_handler.set_synchronous_standby(picked)

                if picked and not allow_promote:
                    # Wait for PostgreSQL to enable synchronous mode and see if we can immediately set sync_standby
                    sleep(2)
                    picked, allow_promote = self.state_handler.pick_synchronous_standby(
                        self.cluster)
                if allow_promote:
                    cluster = self.dcs.get_cluster()
                    if cluster.sync.leader and cluster.sync.leader != self.state_handler.name:
                        logger.info(
                            "Synchronous replication key updated by someone else"
                        )
                        return
                    if not self.dcs.write_sync_state(self.state_handler.name,
                                                     picked,
                                                     index=cluster.sync.index):
                        logger.info(
                            "Synchronous replication key updated by someone else"
                        )
                        return
                    logger.info("Synchronous standby status assigned to %s",
                                picked)
        else:
            if self.cluster.sync.leader and self.dcs.delete_sync_state(
                    index=self.cluster.sync.index):
                logger.info("Disabled synchronous replication")
            self.state_handler.set_synchronous_standby(None)
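The docstring above states the invariant: grant synchronous status in postgresql.conf before writing the DCS key, and clear the DCS key before postgresql.conf when revoking, so the key never names a standby that is not actually replicating synchronously. A minimal illustration of that ordering, with plain dicts standing in for the two stores (not Patroni code):

postgresql_conf = {'synchronous_standby_names': None}  # stands in for postgresql.conf
dcs_sync = {'sync_standby': None}                      # stands in for the DCS sync key

def grant_sync(standby):
    # Grant: postgresql.conf first, DCS second, so the key only ever points
    # at a standby that is already configured as synchronous.
    postgresql_conf['synchronous_standby_names'] = standby
    dcs_sync['sync_standby'] = standby

def revoke_sync():
    # Revoke: DCS first, postgresql.conf second, for the same reason.
    dcs_sync['sync_standby'] = None
    postgresql_conf['synchronous_standby_names'] = None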
Code Example #11
 def create_or_restore_session(self):
     while not self._session:
         try:
             _, member = self._client.kv.get(self.member_path)
             self._session = (member or {}).get('Session')
             if self.refresh_session():
                 self._client.kv.delete(self.member_path)
         except (ConsulException, RequestException):
             logger.info('waiting on consul')
             sleep(5)
Code Example #12
 def demote(self, delete_leader=True):
     if delete_leader:
         self.state_handler.stop()
         self.dcs.delete_leader()
         self.touch_member()
         self.dcs.reset_cluster()
         sleep(2)  # Give somebody time to promote
         self.recover()
     else:
         self.state_handler.follow(None)
Code Example #13
 def create_or_restore_session(self):
     while not self._session:
         try:
             _, member = self._client.kv.get(self.member_path)
             self._session = (member or {}).get('Session')
             if self.refresh_session():
                 self._client.kv.delete(self.member_path)
         except (ConsulException, RequestException):
             logger.info('waiting on consul')
             sleep(5)
Code Example #14
File: ha.py  Project: Jollyturns/patroni
 def demote(self, delete_leader=True):
     if delete_leader:
         self.state_handler.stop()
         self.dcs.delete_leader()
         self.touch_member()
         self.dcs.reset_cluster()
         sleep(2)  # Give somebody time to promote
         self.recover()
     else:
         self.state_handler.follow(None)
Code Example #15
 def __init__(self, hosts, port, uri_path='/exhibitor/v1/cluster/list', poll_interval=300):
     self._exhibitor_port = port
     self._uri_path = uri_path
     self._poll_interval = poll_interval
     self._exhibitors = hosts
     self._master_exhibitors = hosts
     self._zookeeper_hosts = ''
     self._next_poll = None
     while not self.poll():
         logger.info('waiting on exhibitor')
         sleep(5)
Code Example #16
 def demote(self, delete_leader=True):
     if delete_leader:
         self.state_handler.stop()
         self.dcs.delete_leader()
         self.touch_member()
         self.dcs.reset_cluster()
         sleep(2)  # Give somebody time to promote
         cluster = self.dcs.get_cluster()
         node_to_follow = self._get_node_to_follow(cluster)
         self.state_handler.follow(node_to_follow, cluster.leader, True)
     else:
         self.state_handler.follow(None, None)
Code Example #17
File: ha.py  Project: syaroslavtsev/patroni
 def demote(self, delete_leader=True):
     if delete_leader:
         self.state_handler.stop()
         self.dcs.delete_leader()
         self.touch_member()
         self.dcs.reset_cluster()
         sleep(2)  # Give somebody time to promote
         cluster = self.dcs.get_cluster()
         node_to_follow = self._get_node_to_follow(cluster)
         self.state_handler.follow(node_to_follow, cluster.leader, True)
     else:
         self.state_handler.follow(None, None)
Code Example #18
File: ha.py  Project: rnz/patroni
 def demote(self, delete_leader=True):
     if delete_leader:
         self.state_handler.stop()
         self.state_handler.set_role('demoted')
         self.dcs.delete_leader()
         self.dcs.reset_cluster()
         sleep(2)  # Give somebody time to take the leader lock
         cluster = self.dcs.get_cluster()
         node_to_follow = self._get_node_to_follow(cluster)
         self.state_handler.follow(node_to_follow, cluster.leader, recovery=True, need_rewind=True)
     else:
         self.state_handler.follow(None, None)
Code Example #19
File: ha.py  Project: Jollyturns/patroni
    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover

        if failover.scheduled_at:
            # If the failover is in the far future, we shouldn't do anything and just return.
            # If the failover is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the failover
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (failover.scheduled_at - now).total_seconds()

                if delta > self.patroni.nap_time:
                    logger.info('Awaiting failover at %s (in %.0f seconds)', failover.scheduled_at.isoformat(), delta)
                    return
                elif delta < - int(self.patroni.nap_time * 1.5):
                    logger.warning('Found a stale failover value, cleaning up: %s', failover.scheduled_at)
                    self.dcs.manual_failover('', '', index=self.cluster.failover.index)
                    return

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled failover at {}'.format(failover.scheduled_at.isoformat()))
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s', failover.scheduled_at)

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                members = [m for m in self.cluster.members if not failover.candidate or m.name == failover.candidate]
                if self.is_failover_possible(members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    self._async_executor.run_async(self.demote)
                    return 'manual failover: demoting myself'
                else:
                    logger.warning('manual failover: no healthy members found, failover is not possible')
            else:
                logger.warning('manual failover: I am already the leader, no need to failover')
        else:
            logger.warning('manual failover: leader name does not match: %s != %s',
                           self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', index=self.cluster.failover.index)
Code Example #20
File: postgresql.py  Project: jinty/patroni
 def query(self, sql, *params):
     max_attempts = 0
     while True:
         ex = None
         try:
             cursor = self._cursor()
             cursor.execute(sql, params)
             return cursor
         except psycopg2.InterfaceError as e:
             ex = e
         except psycopg2.OperationalError as e:
             if self._connection and self._connection.closed == 0:
                 raise e
             ex = e
         if ex:
             self.disconnect()
             max_attempts += 1
             if max_attempts >= 3:
                 raise ex
             sleep(5)
Code Example #21
 def test_sleep(self):
     sleep(0.01)
Code Example #22
File: dcs.py  Project: jinty/patroni
 def watch(self, timeout):
     sleep(timeout)
Code Example #23
 def test_sleep(self):
     self.assertIsNone(sleep(0.01))
Code Example #24
File: test_utils.py  Project: jinty/patroni
 def test_sleep(self):
     time.sleep = time_sleep
     sleep(0.01)
Code Example #25
File: test_utils.py  Project: alexclear/patroni
 def test_sleep(self):
     sleep(0.01)
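Every snippet above calls a sleep helper imported from the project (patroni.utils) rather than time.sleep directly, which is also why the tests in the last few examples can swap out time.sleep before calling it. A minimal sketch of what such a wrapper might look like; this is an assumption about its shape, not necessarily the exact Patroni implementation:

import time

def sleep(interval):
    """Delegate to time.sleep; a single seam that callers share and tests can patch."""
    time.sleep(interval)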