# Unit tests for AsyncExecutor.
# NOTE(review): the three test methods below have no bodies in this extract, so the
# class does not parse as shown; the original test bodies appear to have been lost.
class TestAsyncExecutor(unittest.TestCase):
    def setUp(self):
        # A fresh executor is built for every test from two Mock() arguments.
        self.a = AsyncExecutor(Mock(), Mock())

    # Thread.start is replaced by a Mock while this test runs.
    @patch.object(Thread, 'start', Mock())
    def test_run_async(self):

    def test_run(self):

    def test_cancel(self):
# Unit tests for AsyncExecutor (second copy of the same class in this extract).
# NOTE(review): the test methods have no bodies here, so the class does not parse
# as shown; the original test bodies appear to have been lost.
class TestAsyncExecutor(unittest.TestCase):

    def setUp(self):
        # A fresh executor is built for every test from two Mock() arguments.
        self.a = AsyncExecutor(Mock(), Mock())

    # Thread.start is replaced by a Mock while this test runs.
    @patch.object(Thread, 'start', Mock())
    def test_run_async(self):

    def test_run(self):

    def test_cancel(self):
コード例 #3
class Ha(object):
    def __init__(self, patroni):
        """Wire the HA controller to its collaborators.

        :param patroni: owner object; its `postgresql` and `dcs` attributes are
            stored as `state_handler` and `dcs` respectively.
        """
        postgresql, dcs = patroni.postgresql, patroni.dcs
        self.patroni = patroni
        self.state_handler = postgresql
        self.dcs = dcs
        # No cluster view has been loaded yet.
        self.cluster = self.old_cluster = None
        self.recovering = False
        self._async_executor = AsyncExecutor()

    def is_paused(self):
        """Return the pause state of the loaded cluster view.

        When no cluster view is loaded, the (falsy) `self.cluster` value itself is
        returned; otherwise the result of `cluster.is_paused()`.
        """
        current = self.cluster
        if not current:
            return current
        return current.is_paused()

    def load_cluster_from_dcs(self):
        """Fetch the cluster view from the DCS and store it on the instance.

        `self.cluster` always becomes the fresh view. `self.old_cluster` is only
        replaced when the fresh view is not unlocked, or when nothing was remembered
        before, so that it keeps the state of the cluster when it was healthy.
        """
        fresh = self.dcs.get_cluster()

        keep_previous = fresh.is_unlocked() and self.old_cluster
        if not keep_previous:
            self.old_cluster = fresh
        self.cluster = fresh

    def acquire_lock(self):
        """Ask the DCS to take the leader key and return whatever the DCS reports."""
        outcome = self.dcs.attempt_to_acquire_leader()
        return outcome

    # Refresh the leader key via dcs.update_leader() and return its result.
    # NOTE(review): the body of the `if` below is missing from this extract (the
    # statement that should run when `write_leader_optime` is set was dropped), so
    # the method does not parse as shown.
    def update_lock(self, write_leader_optime=False):
        ret = self.dcs.update_leader()
        if ret and write_leader_optime:
        return ret

    # Tell whether the leader recorded in the loaded cluster view is this node, by
    # comparing the leader's name with state_handler.name.
    # NOTE(review): the logger.info( call is truncated (its last argument and the
    # closing parenthesis are missing), so the method does not parse as shown.
    def has_lock(self):
        # lock_owner is the falsy `leader` value itself when there is no leader.
        lock_owner = self.cluster.leader and self.cluster.leader.name
        logger.info('Lock owner: %s; I am %s', lock_owner,
        return lock_owner == self.state_handler.name

    # Publish this member's status (connection URLs, state, role and optional extras)
    # to the DCS as compact JSON.
    # NOTE(review): several lines are missing from this extract: the closing brace
    # of `data`, the end of the `in [` condition, the end of the xlog_position( call
    # and the end of the scheduled_restart_data[ subscript. The method does not
    # parse as shown.
    def touch_member(self):
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role
        if self.patroni.tags:
            data['tags'] = self.patroni.tags
        if self.state_handler.pending_restart:
            data['pending_restart'] = True
        # The xlog position is only added while no async action is busy.
        if not self._async_executor.busy and data['state'] in [
                'running', 'restarting', 'starting'
                data['xlog_location'] = self.state_handler.xlog_position(
        if self.patroni.scheduled_restart:
            # Work on a copy so the stored schedule dict is not modified.
            scheduled_restart_data = self.patroni.scheduled_restart.copy()
            scheduled_restart_data['schedule'] = scheduled_restart_data[
            data['scheduled_restart'] = scheduled_restart_data

        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def clone(self, clone_member=None, msg='(without leader)'):
        """Clone the data directory and, on success, start following the cluster.

        :param clone_member: member handed to `state_handler.clone()`; `None` is
            passed through unchanged.
        :param msg: suffix used in the log messages.

        On success the cluster view is re-read from the DCS and the state handler is
        told to follow the selected node. On failure only an error is logged.
        """
        if self.state_handler.clone(clone_member):
            logger.info('bootstrapped %s', msg)
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            # The failure message belongs to the unsuccessful branch only; it used to
            # be logged unconditionally, even after a successful clone.
            logger.error('failed to bootstrap %s', msg)

    # Decide how a node with an empty data directory joins: clone from an existing
    # member when the cluster has a leader, otherwise race for the initialize key.
    # Returns a human-readable status string.
    # NOTE(review): many lines are missing from this extract (unclosed calls, the
    # `try:` matching the `except:` below, and the `else:` branches between the
    # consecutive `return` statements), so the method does not parse as shown.
    def bootstrap(self):
        if not self.cluster.is_unlocked():  # cluster already has leader
            clone_member = self.cluster.get_clone_member(
            member_role = 'leader' if clone_member == self.cluster.leader else 'replica'
            msg = "from {0} '{1}'".format(member_role, clone_member.name)
            self._async_executor.schedule('bootstrap {0}'.format(msg))
                                           args=(clone_member, msg))
            return 'trying to bootstrap {0}'.format(msg)
        # no initialize key and node is allowed to be master and has 'bootstrap' section in a configuration file
        elif self.cluster.initialize is None and not self.patroni.nofailover and 'bootstrap' in self.patroni.config:
            if self.dcs.initialize(create_new=True):  # race for initialization
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                        "removing initialize key after failed attempt to initialize the cluster"
                               separators=(',', ':')))
                return 'initialized a new cluster'
                return 'failed to acquire initialize lock'
            if self.state_handler.can_create_replica_without_replication_connection(
                return "trying to bootstrap (without leader)"
            return 'waiting for leader to bootstrap'

    def recover(self):
        """Flag that a recovery is under way and delegate to `follow()`.

        :returns: whatever `follow()` returns for the two reason strings below, with
            both trailing positional flags set to True.
        """
        demote_reason = "starting as readonly because i had the session lock"
        follow_reason = "starting as a secondary"
        self.recovering = True
        return self.follow(demote_reason, follow_reason, True, True)

    def _get_node_to_follow(self, cluster):
        # determine the node to follow. If replicatefrom tag is set,
        # try to follow the node mentioned there, otherwise, follow the leader.
        if not self.patroni.replicatefrom or self.patroni.replicatefrom == self.state_handler.name:
            node_to_follow = cluster.leader
            node_to_follow = cluster.get_member(self.patroni.replicatefrom)

        return node_to_follow if node_to_follow and node_to_follow.name != self.state_handler.name else None

    # Make postgres follow the node chosen by _get_node_to_follow() and return one of
    # two caller-supplied reason strings (the body refers to them as demote_reason
    # and follow_reason).
    # NOTE(review): the parameter list is truncated after `self,` and the bodies of
    # `if refresh:` and the `else:` of `if recovery:` are missing, so the method does
    # not parse as shown. The names refresh, recovery, need_rewind, demote_reason and
    # follow_reason are presumably parameters; verify against the full source.
    def follow(self,
        if refresh:

        if recovery:
            ret = demote_reason if self.has_lock() else follow_reason
            is_leader = self.state_handler.is_leader()
            ret = demote_reason if is_leader else follow_reason

        node_to_follow = self._get_node_to_follow(self.cluster)

        # While paused (and no rewind is needed) the current role is kept as is.
        if self.is_paused() and not self.state_handler.need_rewind:
            self.state_handler.set_role('master' if is_leader else 'replica')
            if is_leader:
                return 'continue to run as master without lock'
            elif not node_to_follow:
                return 'no action'

        self.state_handler.follow(node_to_follow, self.cluster.leader,
                                  recovery, self._async_executor, need_rewind)

        return ret

    # Return `message` when postgres already reports itself as leader, or when the
    # state handler's role is 'master'.
    # NOTE(review): as shown, `return promote_message` is unreachable and a
    # non-leader falls through returning None. An `else:` branch (and presumably the
    # statements the comments below describe) appears to have been dropped from this
    # extract; confirm against the full source.
    def enforce_master_role(self, message, promote_message):
        if self.state_handler.is_leader(
        ) or self.state_handler.role == 'master':
            # Inform the state handler about its master role.
            # It may be unaware of it if postgres is promoted manually.
            return message
            return promote_message

    # Query a member's REST API and summarise its status as a 5-tuple
    # (member, reachable, in_recovery, xlog_location, tags). On any exception a
    # warning is logged and (member, False, None, 0, {}) is returned.
    # NOTE(review): the function takes no `self` although it sits at method
    # indentation and is used as `self.fetch_node_status`; a @staticmethod decorator
    # was presumably dropped. The docstring's closing quotes, the `try:` line and the
    # end of the json['xlog'][ subscript are also missing, so it does not parse as
    # shown.
    def fetch_node_status(member):
        """This function perform http get request on member.api_url and fetches its status
        :returns: tuple(`member`, reachable, in_recovery, xlog_location)

        reachable - `!False` if the node is not reachable or is not responding with correct JSON
        in_recovery - `!True` if pg_is_in_recovery() == true
        xlog_location - value of `replayed_location` or `location` from JSON, dependin on its role.
        tags - dictionary with values of different tags (i.e. nofailover)

            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name,
                        member.api_url, response.content)
            json = response.json()
            is_master = json['role'] == 'master'
            xlog_location = None if is_master else json['xlog'][
            return (member, True, not is_master, xlog_location,
                    json.get('tags', {}))
        except Exception as e:
            logger.warning("request failed: GET %s (%s)", member.api_url, e)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        """Run `fetch_node_status` for every member in parallel.

        :param members: iterable of members to query.
        :returns: list of `fetch_node_status` results, in the order of `members`;
            an empty list when there is nothing to query.
        """
        members = list(members)
        if not members:
            # ThreadPool(0) raises ValueError; there is simply nothing to fetch.
            return []

        pool = ThreadPool(len(members))
        try:
            # Run API calls on members in parallel
            return pool.map(self.fetch_node_status, members)
        finally:
            # Always release the worker threads, even when a call raised; the pool
            # used to be left open on every invocation.
            pool.close()
            pool.join()

    # Compare this node with the other failover-eligible members and return False
    # when a master is still alive or another reachable member has a higher xlog
    # location; return True otherwise.
    # NOTE(review): several lines are truncated in this extract (the
    # check_replication_lag( call, the closing bracket of the list comprehension,
    # the fetch_nodes_statuses( call, the tags.get( call and the logger.warning(
    # call), so the method does not parse as shown.
    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if check_replication_lag and not self.state_handler.check_replication_lag(
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [
            m for m in members if m.name != self.state_handler.name
            and not m.nofailover and m.api_url

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(
                if reachable and not tags.get(
                        False):  # If the node is unreachable it's not healhy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive',
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    # Return True when at least one other member (not tagged nofailover, with an
    # api_url) is reachable and allowed to promote; log why each rejected member was
    # skipped.
    # NOTE(review): the list comprehension's closing bracket, the end of the
    # fetch_nodes_statuses( call, the end of the last logger.info( call and the
    # `else:` before the "members list is empty" warning are missing from this
    # extract, so the method does not parse as shown.
    def is_failover_possible(self, members):
        ret = False
        members = [
            m for m in members if m.name != self.state_handler.name
            and not m.nofailover and m.api_url
        if members:
            for member, reachable, _, _, tags in self.fetch_nodes_statuses(
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote',
            logger.warning('manual failover: members list is empty')
        return ret

    # Decide whether this node should take over when a manual failover key exists and
    # the cluster has no leader. Returns True/False, or None in the branch that
    # handles a stale failover key while paused.
    # NOTE(review): many calls are truncated in this extract (get_member(, logger
    # calls, list comprehensions without closing brackets), so the method does not
    # parse as shown and the exact side effects of the missing lines are unknown.
    def manual_failover_process_no_leader(self):
        failover = self.cluster.failover
        if failover.candidate:  # manual failover to specific member
            if failover.candidate == self.state_handler.name:  # manual failover to me
                return True
            elif self.is_paused():
                # Remove failover key if the node to failover has terminated to avoid waiting for it indefinitely
                # In order to avoid attempts to delete this key from all nodes only the master is allowed to do it.
                if (not self.cluster.get_member(failover.candidate,
                        and self.state_handler.is_leader()):
                        "manual failover: removing failover key because failover candidate is not running"
                    return None
                return False

            # find specific node and check that it is healthy
            member = self.cluster.get_member(failover.candidate,
            if member:
                member, reachable, _, _, tags = self.fetch_node_status(member)
                if reachable and not tags.get('nofailover',
                                              False):  # node is healthy
                    logger.info('manual failover: to %s, i am %s', member.name,
                    return False
                # we wanted to failover to specific member but it is not healthy
                if not reachable:
                    logger.warning('manual failover: member %s is unhealthy',
                elif tags.get('nofailover', False):
                        'manual failover: member %s is not allowed to promote',

            # at this point we should consider all members as a candidates for failover
            # i.e. we assume that failover.candidate is None
        elif self.is_paused():
            return False

        # try to pick some other members to failover and check that they are healthy
        if failover.leader:
            if self.state_handler.name == failover.leader:  # I was the leader
                # exclude me and desired member which is unhealthy (failover.candidate can be None)
                members = [
                    m for m in self.cluster.members
                    if m.name not in (failover.candidate, failover.leader)
                if self.is_failover_possible(
                        members):  # check that there are healthy members
                    return False
                else:  # I was the leader and it looks like currently I am the only healthy member
                    return True

            # at this point we assume that our node is a candidate for a failover among all nodes except former leader

        # exclude former leader from the list (failover.leader can be None)
        members = [
            m for m in self.cluster.members if m.name != failover.leader
        return self._is_healthiest_node(members, check_replication_lag=False)

    # Decide whether this node may take part in the leader race. The checks run in
    # order: manual failover while paused, already being leader, pause, the
    # nofailover tag, a pending failover key, and finally the regular health check
    # over the members of both the current and the remembered cluster view.
    # NOTE(review): the dict comprehension at the end is missing its closing brace
    # in this extract, so the method does not parse as shown.
    def is_healthiest_node(self):
        if self.is_paused() and not self.patroni.nofailover and \
                self.cluster.failover and not self.cluster.failover.scheduled_at:
            ret = self.manual_failover_process_no_leader()
            if ret is not None:  # continue if we just deleted the stale failover key as a master
                return ret

        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.is_paused():
            return False

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check
        # Keyed by name so a member present in both views is only checked once.
        members = {
            m.name: m
            for m in self.cluster.members + self.old_cluster.members
        return self._is_healthiest_node(members.values())

    # Step down: when `delete_leader` is true, wait two seconds, re-read the cluster
    # from the DCS and compute the node to follow; then call
    # state_handler.follow(None, None).
    # NOTE(review): `node_to_follow` is computed but never used and nothing here
    # deletes the leader key despite the parameter name; lines (including an `else:`)
    # appear to have been dropped from this extract. Confirm against the full source.
    def demote(self, delete_leader=True):
        if delete_leader:
            sleep(2)  # Give a time to somebody to take the leader lock
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(None, None)

    # Decide whether a scheduled action is due now. Returns False while the schedule
    # is further away than dcs.loop_wait, or older than 1.5 * loop_wait (stale).
    # Otherwise it sleeps until the scheduled moment and returns True. Nothing is
    # run while the cluster is paused.
    # NOTE(review): the parameter list is truncated after `scheduled_at,`, the `try:`
    # matching `except TypeError:` is missing, and two logger calls are unclosed, so
    # the method does not parse as shown. The clean-up described in the comments is
    # not visible in this extract.
    def should_run_scheduled_action(self, action_name, scheduled_at,
        if scheduled_at and not self.is_paused():
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the scheduled action
            # Additionally, if the scheduled action cannot be executed altogether, i.e. there is an error
            # or the action is in the past - we take care of cleaning it up.
            now = datetime.datetime.now(pytz.utc)
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info('Awaiting %s at %s (in %.0f seconds)',
                                action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < -int(self.dcs.loop_wait * 1.5):
                    logger.warning('Found a stale %s value, cleaning up: %s',
                                   action_name, scheduled_at.isoformat())
                    return False

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled {0} at %s'.format(action_name),
                return True
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s',
        return False

    # Handle a manual failover request while this node holds the leader lock: demote
    # when a healthy target exists (returning a status string), otherwise fall
    # through to removing the failover key from the DCS.
    # NOTE(review): the logger calls, the demote call and several `else:` lines are
    # missing from this extract (only their string arguments remain), and the first
    # `if (` is unclosed, so the method does not parse as shown.
    def process_manual_failover_from_leader(self):
        failover = self.cluster.failover

        if (failover.scheduled_at and not self.should_run_scheduled_action(
                "failover", failover.scheduled_at,
                lambda: self.dcs.manual_failover('', '', index=failover.index))

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                if not failover.candidate and self.is_paused():
                        'Failover is possible only to a specific candidate in a paused state'
                    members = [
                        m for m in self.cluster.members if
                        not failover.candidate or m.name == failover.candidate
                    if self.is_failover_possible(
                            members):  # check that there are healthy members
                            'manual failover: demote')
                        return 'manual failover: demoting myself'
                            'manual failover: no healthy members found, failover is not possible'
                    'manual failover: I am already the leader, no need to failover'
                'manual failover: leader name does not match: %s != %s',
                failover.leader, self.state_handler.name)

        logger.info('Cleaning up failover key')
        self.dcs.manual_failover('', '', index=failover.index)

    # Leader race: when this node is the healthiest it tries to acquire the leader
    # lock and enforce the master role; otherwise it follows another node. Returns a
    # status string.
    # NOTE(review): the `else:` lines, the logger calls around the failover-key
    # updates and the trailing arguments of several follow( calls are missing from
    # this extract, so the method does not parse as shown.
    def process_unhealthy_cluster(self):
        """Cluster has no leader key"""

        if self.is_healthiest_node():
            if self.acquire_lock():
                failover = self.cluster.failover
                if failover:
                    if self.is_paused(
                    ) and failover.leader and failover.candidate:
                            'Updating failover key after acquiring leader lock...'
                        self.dcs.manual_failover('', failover.candidate,
                            'Cleaning up failover key after acquiring leader lock...'
                        self.dcs.manual_failover('', '')
                return self.enforce_master_role(
                    'acquired session lock as a leader',
                    'promoted self to leader by acquiring session lock')
                return self.follow(
                    'demoted self after trying and failing to obtain lock',
                    'following new leader after trying and failing to obtain lock'
            # when we are doing manual failover there is no guaranty that new leader is ahead of any other node
            # node tagged as nofailover can be ahead of the new leader either, but it is always excluded from elections
            need_rewind = bool(
                self.cluster.failover) or self.patroni.nofailover
            if need_rewind:
                sleep(2)  # Give a time to somebody to take the leader lock

            if self.patroni.nofailover:
                return self.follow(
                    'demoting self because I am not allowed to become master',
                    'following a different leader because I am not allowed to promote',
            return self.follow(
                'demoting self because i am not the healthiest node',
                'following a different leader because i am not the healthiest node',

    # Regular cycle when the cluster has a leader. A lock holder processes any
    # manual failover request, refreshes the leader lock and enforces the master
    # role; every other node follows the leader. Returns a status string.
    # NOTE(review): this extract parses, but lines were evidently dropped. The
    # statements after `return self.enforce_master_role(...)` are unreachable (an
    # `else:` is presumably missing), 'does not have lock' is logged inside the
    # has_lock() branch, and the "removed leader lock" message has no visible action
    # behind it. Confirm against the full source before relying on this flow.
    def process_healthy_cluster(self):
        if self.has_lock():
            # While paused, a failover request is only processed by a running leader.
            if self.cluster.failover and (not self.is_paused()
                                          or self.state_handler.is_leader()):
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.is_paused() and not self.state_handler.is_leader():
                if self.cluster.failover and self.cluster.failover.candidate == self.state_handler.name:
                    return 'waiting to become master after promote...'

                return 'removed leader lock because postgres is not running as master'

            if self.update_lock(True):
                return self.enforce_master_role(
                    'no action.  i am the leader with the lock',
                    'promoted self to leader because i had the session lock')
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
                return 'demoted self because failed to update leader lock in DCS'
            logger.info('does not have lock')
        return self.follow(
            'demoting self because i do not have the lock and i was a leader',
            'no action.  i am a secondary and i am following a leader', False)

    # Run a previously scheduled restart when it is due. Returns the restart message
    # on success and None when the restart was cancelled or refused.
    # NOTE(review): the logger call around the "Cancelling scheduled restart" string
    # and the end of the should_run_scheduled_action( call are missing from this
    # extract, so the method does not parse as shown.
    def evaluate_scheduled_restart(self):
        # restart if we need to
        restart_data = self.future_restart_scheduled()
        if restart_data:
            recent_time = self.state_handler.postmaster_start_time()
            request_time = restart_data['postmaster_start_time']
            # check if postmaster start time has changed since the last restart
            if recent_time and request_time and recent_time != request_time:
                    "Cancelling scheduled restart: postgres restart has already happened at %s",
                return None

        if (restart_data and self.should_run_scheduled_action(
                'restart', restart_data['schedule'],
                ret, message = self.restart(restart_data, run_async=True)
                if not ret:
                    logger.warning("Scheduled restart: %s", message)
                    return None
                return message

    # Check the optional restart filters (role, postgres version, pending-restart
    # flag) against the state handler. Returns True when no filter objects to the
    # restart, otherwise logs the reason and returns False.
    # NOTE(review): the right-hand side of the version comparison, the `else:`
    # before logger.info and the end of that logger call are missing from this
    # extract, so the method does not parse as shown.
    def restart_matches(self, role, postgres_version, pending_restart):
        reason_to_cancel = ""
        # checking the restart filters here seem to be less ugly than moving them into the
        # run_scheduled_action.
        if role and role != self.state_handler.role:
            reason_to_cancel = "host role mismatch"

        if (postgres_version and
                self.state_handler.postgres_version_to_int(postgres_version) <=
            reason_to_cancel = "postgres version mismatch"

        if pending_restart and not self.state_handler.pending_restart:
            reason_to_cancel = "pending restart flag is not set"

        if not reason_to_cancel:
            return True
            logger.info("not proceeding with the restart: %s",
        return False

    def schedule_future_restart(self, restart_data):
        """Remember `restart_data` as the scheduled restart unless one already exists.

        The check and the assignment run inside the `_async_executor` context.

        :returns: True when the schedule was stored, False when one was already set.
        """
        stored = False
        with self._async_executor:
            if not self.patroni.scheduled_restart:
                self.patroni.scheduled_restart = restart_data
                stored = True
        return stored

    def delete_future_restart(self):
        """Drop the scheduled restart, if any.

        The check and the reset run inside the `_async_executor` context.

        :returns: True when a schedule existed and was cleared, False otherwise.
        """
        with self._async_executor:
            if not self.patroni.scheduled_restart:
                return False
            self.patroni.scheduled_restart = {}
            return True

    # Return a copy of patroni.scheduled_restart when it is a dict, otherwise None.
    # NOTE(review): the first operand of the condition is missing (the parenthesis
    # opens directly onto `and`), so the method does not parse as shown.
    def future_restart_scheduled(self):
        return self.patroni.scheduled_restart.copy() if (
            and isinstance(self.patroni.scheduled_restart, dict)) else None

    def restart_scheduled(self):
        """Tell whether the async executor's current scheduled action is 'restart'."""
        current_action = self._async_executor.scheduled_action
        return current_action == 'restart'

    # Restart postgres, optionally only when the filters in `restart_data` match.
    # Returns a (success, message) tuple.
    # NOTE(review): lines were evidently dropped from this extract:
    #   - restart_matches() is called with two arguments, while it is declared in
    #     this file with three (role, postgres_version, pending_restart);
    #   - the `run_async` branch returns 'restart initiated' without any visible
    #     call that starts the restart;
    #   - `return (False, 'restart failed')` is unreachable (an `else:` is
    #     presumably missing).
    # Confirm against the full source.
    def restart(self, restart_data=None, run_async=False):
        """ conditional and unconditional restart """
        if (restart_data and isinstance(restart_data, dict) and
                not self.restart_matches(restart_data.get('role'),
                                         ('restart_pending' in restart_data))):
            return (False, "restart conditions are not satisfied")

        with self._async_executor:
            # schedule() returns the name of an action already in progress, if any.
            prev = self._async_executor.schedule('restart')
            if prev is not None:
                return (False, prev + ' already in progress')

        if run_async:
            return (True, 'restart initiated')
        elif self._async_executor.run(self.state_handler.restart):
            return (True, 'restarted successfully')
            return (False, 'restart failed')

    # Worker for reinitialize(): picks a member to clone from and builds the
    # "from <role> '<name>'" description.
    # NOTE(review): the statements before clone_member and the call that consumes the
    # formatted string are missing from this extract (only its last argument line
    # remains), so the method does not parse as shown. The `cluster` parameter is
    # unused in the visible lines, which read `self.cluster` instead.
    def _do_reinitialize(self, cluster):

        clone_member = self.cluster.get_clone_member(self.state_handler.name)
        member_role = 'leader' if clone_member == self.cluster.leader else 'replica'
                   "from {0} '{1}'".format(member_role, clone_member.name))

    # Schedule a reinitialize of this node. Returns an explanatory string when the
    # cluster has no leader, when this node is the leader, or when another action is
    # already in progress.
    # NOTE(review): the first statement inside the `with` block, the end of the
    # schedule( call and the call that owns `args=(self.cluster, ))` are missing
    # from this extract, so the method does not parse as shown.
    def reinitialize(self):
        with self._async_executor:

            if self.cluster.is_unlocked():
                return 'Cluster has no leader, can not reinitialize'

            if self.cluster.leader.name == self.state_handler.name:
                return 'I am the leader, can not reinitialize'

            action = self._async_executor.schedule('reinitialize',
            if action is not None:
                return '{0} already in progress'.format(action)

                                       args=(self.cluster, ))

    def handle_long_action_in_progress(self):
        """Report status while an asynchronous action is still running.

        A lock holder refreshes the leader lock and reports whether that worked.
        A node without the lock reports that it cannot join the leader race when the
        cluster is unlocked, and otherwise just names the action in progress.

        :returns: a human-readable status string.
        """
        action = self._async_executor.scheduled_action
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + action
            # The failure message was unreachable before (missing `else`).
            return 'failed to update leader lock during ' + action
        if self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        # Previously unreachable; a healthy cluster fell through and returned None.
        return action + ' in progress'

    def sysid_valid(sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds sine 1970,
        # so even 1 << 32 would have 10 digits.
        sysid = str(sysid)
        return len(sysid) >= 10 and sysid.isdigit()

    def post_recover(self):
        """Report the outcome of a recovery attempt.

        :returns: None when postgres is running; otherwise a status string that
            depends on whether this node holds the leader lock.
        """
        if self.state_handler.is_running():
            return None
        # NOTE(review): the first message says the leader key was removed, but no
        # removal is visible in this method; confirm against the full source.
        if self.has_lock():
            return 'removed leader key after trying and failing to start postgres'
        return 'failed to start postgres'

    # One iteration of the HA loop. Visible steps, in order: repair missing
    # initialize/config keys as the lock holder, report on a busy async action,
    # finish a pending recovery, bootstrap an empty data directory, check the system
    # ID, try to start an unhealthy postgres, then dispatch to
    # process_unhealthy_cluster() or process_healthy_cluster(). DCS and PostgreSQL
    # errors are turned into status strings.
    # NOTE(review): this extract is heavily truncated. The `try:` matching the
    # `except` clauses, the statements at the start of the body, several calls (only
    # their trailing argument lines remain) and multiple `else:` lines are missing,
    # so the method does not parse as shown.
    def _run_cycle(self):


            # cluster has leader key but not initialize key
            if not (self.cluster.is_unlocked() or self.sysid_valid(
                    self.cluster.initialize)) and self.has_lock():
                    create_new=(self.cluster.initialize is None),

            if not (self.cluster.is_unlocked() or self.cluster.config
                    and self.cluster.config.data) and self.has_lock():
                               separators=(',', ':')))

            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # we've got here, so any async action has finished. Check if we tried to recover and failed
            if self.recovering and not self.state_handler.need_rewind:
                self.recovering = False
                msg = self.post_recover()
                if msg is not None:
                    return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(
                    self.cluster.initialize) and self.cluster.is_unlocked(
                    ) and not self.is_paused():
                    create_new=(self.cluster.initialize is None),
                # check if we are allowed to join
                if self.sysid_valid(
                ) and self.cluster.initialize != self.state_handler.sysid:
                        "system ID mismatch, node %s belongs to a different cluster: %s != %s",
                        self.state_handler.name, self.cluster.initialize,

            if not self.state_handler.is_healthy():
                if self.is_paused():
                    if self.has_lock():
                        return 'removed leader lock because postgres is not running'
                    elif not self.state_handler.need_rewind:
                        return 'postgres is not running'

                # try to start dead postgres
                return self.recover()

                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                    msg = self.evaluate_scheduled_restart()
                    if msg is not None:
                        return msg
                    return self.process_healthy_cluster()
                # we might not have a valid PostgreSQL connection here if another thread
                # stops PostgreSQL, therefore, we only reload replication slots if no
                # asynchronous processes are running (should be always the case for the master)
                if not self._async_executor.busy:
                    if not self.state_handler.cb_called:
        except DCSError:
            logger.error('Error communicating with DCS')
            if not self.is_paused() and self.state_handler.is_running(
            ) and self.state_handler.is_leader():
                return 'demoted self because DCS is not accessible and i was a leader'
            return 'DCS is not accessible'
        except (psycopg2.Error, PostgresConnectionException):
            return 'Error communicating with PostgreSQL. Will try again later'

    def run_cycle(self):
        """Run one HA iteration inside the `_async_executor` context.

        :returns: the status string from `_run_cycle()`, prefixed with 'PAUSE: '
            when the cluster is paused.
        """
        with self._async_executor:
            info = self._run_cycle()
            # The pause state is read after the cycle has run, as before.
            prefix = 'PAUSE: ' if self.is_paused() else ''
            return prefix + info
class Ha:

    def __init__(self, patroni):
        """Bind the HA logic to `patroni` and start without any known cluster state.

        :param patroni: object exposing `postgresql` and `dcs` attributes.
        """
        self.patroni = patroni
        # Shortcuts to the collaborators owned by the patroni object.
        self.state_handler, self.dcs = patroni.postgresql, patroni.dcs
        # Nothing has been read from the DCS yet.
        self.cluster = self.old_cluster = None
        self._async_executor = AsyncExecutor()

    def load_cluster_from_dcs(self):
        """Fetch the cluster view from the DCS and store it on `self.cluster`.

        `self.old_cluster` is only replaced when the fetched view is not unlocked
        or when no previous view has been kept yet.
        """
        fresh = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        if not (fresh.is_unlocked() and self.old_cluster):
            self.old_cluster = fresh
        self.cluster = fresh

    def acquire_lock(self):
        """Ask the DCS to acquire the leader key and return the DCS answer unchanged."""
        acquired = self.dcs.attempt_to_acquire_leader()
        return acquired

    def update_lock(self):
        # Asks the DCS to refresh the leader key and returns the DCS result unchanged.
        ret = self.dcs.update_leader()
        # NOTE(review): this `if` has no body in this copy of the file (a syntax error as written);
        # the statement(s) meant to run after a successful update appear to have been lost -- restore them.
        if ret:
        return ret

    def has_lock(self):
        """Return whether the leader recorded in `self.cluster` carries this node's name.

        The lock owner (or the falsy `cluster.leader` value) is logged on every call.
        """
        leader = self.cluster.leader
        lock_owner = leader and leader.name
        my_name = self.state_handler.name
        logger.info('Lock owner: %s; I am %s', lock_owner, my_name)
        return lock_owner == my_name

    def touch_member(self):
        # Publishes this member's connection URLs, state, role and tags to the DCS as compact JSON.
        # NOTE(review): the dict literal below is never closed and the xlog_location assignment is
        # indented one level deeper than its `if` (a wrapper such as try/except seems to have been
        # lost); this block does not parse as written -- restore the missing lines.
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role,
            'tags': self.patroni.tags
        # The xlog position is only added for the states listed here.
        if data['state'] in ['running', 'restarting', 'starting']:
                data['xlog_location'] = self.state_handler.xlog_position()
        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def copy_backup_from_leader(self, leader):
        """Bootstrap the local node from `leader` and log the outcome.

        :param leader: passed through unchanged to ``state_handler.bootstrap``.
        """
        if self.state_handler.bootstrap(leader):
            logger.info('bootstrapped from leader')
        else:
            # The failure message must only be emitted when bootstrap reported a falsy result;
            # without this branch it was logged right after the success message instead.
            logger.error('failed to bootstrap from leader')

    def bootstrap(self):
        # Decides how an empty node joins: copy from the leader, race to initialize a new
        # cluster, or wait.  Returns a human-readable status string.
        # NOTE(review): this block does not parse as written -- a `try:` (and whatever it ran
        # before the visible `except`), the rest of the `except` body, and the `else:` lines
        # separating the return statements appear to be missing.  Restore before relying on it.
        if not self.cluster.is_unlocked():  # cluster already has leader
            # Mark the action as scheduled, then run the copy in the background.
            self._async_executor.schedule('bootstrap from leader')
            self._async_executor.run_async(self.copy_backup_from_leader, args=(self.cluster.leader, ))
            return 'trying to bootstrap from leader'
        elif not self.cluster.initialize and not self.patroni.nofailover:  # no initialize key
            if self.dcs.initialize(create_new=True):  # race for initialization
                    # Replace the placeholder initialize key with the real system identifier.
                    self.dcs.initialize(create_new=False, sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    logger.info("removing initialize key after failed attempt to initialize the cluster")
                return 'initialized a new cluster'
                return 'failed to acquire initialize lock'
            return 'waiting for leader to bootstrap'

    def recover(self):
        # Tries to start a PostgreSQL that is not running and returns a status string
        # (or None on the final path, which only logs).
        has_lock = self.has_lock()

        # try to see if we are the former master that crashed. If so - we likely need to run pg_rewind
        # in order to join the former standby being promoted.
        pg_controldata = self.state_handler.controldata()
        # NOTE(review): the `if` below has no body in this copy (syntax error as written);
        # the statement handling the "crashed master" case appears to be missing.
        if not has_lock and pg_controldata and\
                pg_controldata.get('Database cluster state', '') == 'in production':  # crashed master

        # XXX: follow the leader calls stop, which might take quite some time.
        # perhaps we should run sync asynchronously
        # (we still need the exit code from follow_the_leader)
        # With the lock we start without a leader to follow; otherwise we follow the cluster leader.
        ret = self.state_handler.follow_the_leader(None if has_lock else self.cluster.leader, recovery=True)
        if not ret:
            if not has_lock:
                return 'failed to start postgres'
            # NOTE(review): the message says the leader key was removed, but no removal call
            # is visible here -- presumably another lost line; confirm.
            return 'removed leader key after trying and failing to start postgres'
        if not has_lock:
            return 'started as a secondary'
        logger.info('started as readonly because i had the session lock')

    def follow_the_leader(self, demote_reason, follow_reason, refresh=True):
        """Point the local node at the current leader and return a status message.

        :param demote_reason: returned when ``state_handler.is_leader()`` is true.
        :param follow_reason: returned otherwise.
        :param refresh: when truthy, reload the cluster view from the DCS first.
        """
        if refresh:
            self.load_cluster_from_dcs()

        handler = self.state_handler
        if handler.is_leader():
            ret = demote_reason
        else:
            ret = follow_reason

        # A leader entry carrying our own name means there is nobody to follow.
        leader = self.cluster.leader
        if (leader and leader.name) == handler.name:
            leader = None

        if not handler.check_recovery_conf(leader):
            executor = self._async_executor
            executor.schedule('changing primary_conninfo and restarting')
            executor.run_async(handler.follow_the_leader, (leader, ))
        return ret

    def enforce_master_role(self, message, promote_message):
        # Returns `message` when the state handler already reports leader/master.
        # NOTE(review): the second return is indented inside the `if` and is therefore unreachable,
        # so a non-master node gets None.  An `else:` branch (and presumably the promotion itself)
        # appears to be missing from this copy -- restore it.
        if self.state_handler.is_leader() or self.state_handler.role == 'master':
            return message
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """Perform an HTTP GET request on `member.api_url` and extract the member's status.

        Declared as a static method because it takes no `self`, yet is invoked through
        the instance (`self.fetch_node_status(...)` and as a `pool.map` callable).

        :param member: object providing `name` and `api_url`.
        :returns: tuple(`member`, reachable, in_recovery, xlog_location, tags)

        reachable - `False` if the request failed or the response could not be interpreted
        in_recovery - `True` if the reported role is not 'master'
        xlog_location - value of `replayed_location` or `location` from JSON, depending on its role.
        tags - dictionary with values of different tags (i.e. nofailover)
        """
        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name, member.api_url, response.content)
            # Named `data` so the local does not shadow the `json` module used elsewhere in the file.
            data = response.json()
            is_master = data['role'] == 'master'
            xlog_location = data['xlog']['location' if is_master else 'replayed_location']
            tags = data.get('tags', dict())
            return (member, True, not is_master, xlog_location, tags)
        except Exception:
            # Any failure (network, bad JSON, missing keys) marks the member as unreachable.
            logging.exception('request failed: GET %s', member.api_url)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        """Call `fetch_node_status` for every member in parallel.

        :param members: sized iterable of members to query.
        :returns: list of per-member results, in the order of `members`;
                  an empty list when there is nothing to query.
        """
        if not members:
            # ThreadPool(0) raises ValueError, and there is nothing to query anyway.
            return []
        pool = ThreadPool(len(members))
        try:
            return pool.map(self.fetch_node_status, members)  # Run API calls on members in parallel
        finally:
            # Release the worker threads instead of leaving one pool behind per call.
            pool.close()
            pool.join()

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if self.state_handler.is_leader():
            return True

        if self.patroni.nofailover is True:
            return False

        if check_replication_lag and not self.state_handler.check_replication_lag(self.cluster.last_leader_operation):
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):  # If the node is unreachable it's not healhy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive', member.name)
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    def is_failover_possible(self, members):
        """Tell whether at least one other member could take over.

        :param members: candidate members; this node, members flagged `nofailover`
                        and members without `api_url` are filtered out first.
        :returns: True when any remaining member is reachable and not tagged
                  `nofailover`, otherwise False.
        """
        ret = False
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if members:
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote', member.name)
        else:
            # Only warn when the filtered list really is empty; previously this
            # warning was emitted after every non-empty scan instead.
            logger.warning('manual failover: members list is empty')
        return ret

    def manual_failover_process_no_leader(self):
        """Decide whether this node should go for the leader key while a failover request is pending.

        :returns: True when the request names this node, or this node was the leader and no
                  other member can take over; False when the requested member is healthy or
                  another member can take over; otherwise the result of the regular health
                  check (without the replication lag check) against everyone but the former leader.
        """
        failover = self.cluster.failover
        wanted = failover.member

        if wanted:  # manual failover to specific member
            if wanted == self.state_handler.name:  # manual failover to me
                return True

            # find specific node and check that it is healthy
            target = next((m for m in self.cluster.members if m.name == wanted), None)
            if target is not None:
                member, reachable, in_recovery, xlog_location, tags = self.fetch_node_status(target)
                if reachable and not tags.get('nofailover', False):  # node is healthy
                    logger.info('manual failover: to %s, i am %s', member.name, self.state_handler.name)
                    return False
                # we wanted to failover to specific member but it is not healthy
                if reachable:
                    # reachable but rejected above, so the nofailover tag must be set
                    logger.warning('manual failover: member %s is not allowed to promote', member.name)
                else:
                    logger.warning('manual failover: member %s is unhealthy', member.name)

            # from here on every member is treated as a candidate,
            # as if no specific member had been requested

        former_leader = failover.leader
        if former_leader and self.state_handler.name == former_leader:  # I was the leader
            # exclude me and desired member which is unhealthy (failover.member can be None)
            others = [m for m in self.cluster.members if m.name not in (wanted, former_leader)]
            # stay leader only when nobody else is able to take over
            return not self.is_failover_possible(others)

        # our node is a candidate among all nodes except the former leader
        # (failover.leader can be None)
        candidates = [m for m in self.cluster.members if m.name != former_leader]
        return self._is_healthiest_node(candidates, check_replication_lag=False)

    def is_healthiest_node(self):
        """Tell whether this node may take part in the leader race.

        A leader is always considered healthy, a node tagged `nofailover` never is,
        and a pending failover request is delegated to
        `manual_failover_process_no_leader`.  Otherwise the usual health check runs
        against the members of both the current and the last kept cluster view.
        """
        if self.state_handler.is_leader():  # leader is always the healthiest
            return True
        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False
        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check; de-duplicate members by name, later entries win
        by_name = {}
        for member in self.cluster.members + self.old_cluster.members:
            by_name[member.name] = member
        return self._is_healthiest_node(by_name.values())

    def demote(self, delete_leader=True):
        # NOTE(review): the body of this method is missing from this copy -- the `if` below has
        # nothing under it (syntax error as written).  Judging only by the parameter name, the
        # leader key is presumably deleted when `delete_leader` is true; restore from upstream.
        if delete_leader:

    def process_manual_failover_from_leader(self):
        """Handle a pending manual failover request while this node holds the leader lock.

        :returns: a status string when a demotion was scheduled, otherwise None after
                  the failover key has been cleaned up.
        """
        failover = self.cluster.failover
        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.member or failover.member != self.state_handler.name:
                members = [m for m in self.cluster.members if not failover.member or m.name == failover.member]
                if self.is_failover_possible(members):  # check that there are healthy members
                    # NOTE(review): only the scheduling is visible here; confirm where the
                    # demotion itself is started.
                    self._async_executor.schedule('manual failover: demote')
                    return 'manual failover: demoting myself'
                else:
                    logger.warning('manual failover: no healthy members found, failover is not possible')
            else:
                logger.warning('manual failover: I am already the leader, no need to failover')
        else:
            logger.warning('manual failover: leader name does not match: %s != %s',
                           self.cluster.failover.leader, self.state_handler.name)

        # The request could not be honoured: drop it so it is not processed again.
        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', self.cluster.failover.index)

    def process_unhealthy_cluster(self):
        """Handle a cluster without a leader: race for the lock or follow whoever wins.

        :returns: status string describing the action taken.
        """
        if self.is_healthiest_node():
            if self.acquire_lock():
                if self.cluster.failover:
                    logger.info('Cleaning up failover key after acquiring leader lock...')
                    self.dcs.manual_failover('', '')
                return self.enforce_master_role('acquired session lock as a leader',
                                                'promoted self to leader by acquiring session lock')
            else:
                # Somebody else won the race for the leader key.
                return self.follow_the_leader('demoted self due after trying and failing to obtain lock',
                                              'following new leader after trying and failing to obtain lock')
        else:
            # Not a candidate at all: tell apart "not allowed" from "not healthy enough".
            if self.patroni.nofailover:
                return self.follow_the_leader('demoting self because I am not allowed to become master',
                                              'following a different leader because I am not allowed to promote')
            return self.follow_the_leader('demoting self because i am not the healthiest node',
                                          'following a different leader because i am not the healthiest node')

    def process_healthy_cluster(self):
        """Handle a cluster that has a leader: keep the lock if it is ours, otherwise follow.

        :returns: status string describing the action taken.
        """
        if self.has_lock():
            if self.cluster.failover:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.update_lock():
                return self.enforce_master_role('no action.  i am the leader with the lock',
                                                'promoted self to leader because i had the session lock')
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
        else:
            logger.info('does not have lock')
        # Reached without the lock or after failing to refresh it; the cluster view is not reloaded.
        return self.follow_the_leader('demoting self because i do not have the lock and i was a leader',
                                      'no action.  i am a secondary and i am following a leader', False)

    def schedule(self, action):
        """Register `action` with the async executor inside its context and return the executor's answer."""
        executor = self._async_executor
        with executor:
            outcome = executor.schedule(action)
            return outcome

    def restart_scheduled(self):
        """Return True when the executor's scheduled action is 'restart'."""
        pending = self._async_executor.scheduled_action
        return pending == 'restart'

    def schedule_reinitialize(self):
        """Schedule the 'reinitialize' action and return whatever `schedule` returns."""
        action = 'reinitialize'
        return self.schedule(action)

    def reinitialize_scheduled(self):
        """Return True when the executor's scheduled action is 'reinitialize'."""
        pending = self._async_executor.scheduled_action
        return pending == 'reinitialize'

    def restart(self):
        """Restart PostgreSQL through the async executor.

        :returns: tuple(success, message).  `success` is False when another action was
                  already scheduled or when the restart itself failed.
        """
        with self._async_executor:
            prev = self._async_executor.schedule('restart', True)
            if prev is not None:
                return (False, prev + ' already in progress')
        if self._async_executor.run(self.state_handler.restart):
            return (True, 'restarted successfully')
        # The failure result used to sit unreachable behind the success return,
        # so a failed restart produced None instead of a tuple.
        return (False, 'restart failed')

    # NOTE(review): the body of `reinitialize` is missing from this copy of the file (the `def`
    # is followed directly by the next method, which is a syntax error).  It is handed to
    # `run_async` with the cluster as its only argument; restore the implementation from upstream.
    def reinitialize(self, cluster):

    def process_scheduled_action(self):
        """Start a scheduled reinitialize when the cluster state allows it.

        :returns: 'reinitialize started' when the background job was launched, otherwise None.
        """
        if self.reinitialize_scheduled():
            if self.cluster.is_unlocked():
                logger.error('Cluster has no leader, can not reinitialize')
            elif self.has_lock():
                logger.error('I am the leader, can not reinitialize')
            else:
                # Only a non-leader in a cluster that has a leader may reinitialize; without
                # this branch the job was started for the leader and never for a replica.
                self._async_executor.run_async(self.reinitialize, args=(self.cluster, ))
                return 'reinitialize started'

    def handle_long_action_in_progress(self):
        """Report on the HA cycle while a background action is still running.

        The leader lock is refreshed when this node holds it.

        :returns: status string mentioning the scheduled action where applicable.
        """
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + self._async_executor.scheduled_action
            else:
                return 'failed to update leader lock during ' + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        else:
            return self._async_executor.scheduled_action + ' in progress'

    def sysid_valid(self, sysid):
        """Return a truthy value when `sysid` looks like a PostgreSQL system identifier.

        The string form must consist of digits only and be at least 10 characters long.
        """
        # sysid does tv_sec << 32, where tv_sec is the number of seconds since 1970,
        # so even 1 << 32 would have 10 digits.
        text = str(sysid)
        return text and len(text) >= 10 and text.isdigit()

    def _run_cycle(self):
        # One pass of the HA state machine; the visible branches return a status string.
        # NOTE(review): this block does not parse as written.  The opening `try:` that the
        # `except` clauses below belong to, whatever ran before the first visible statement,
        # the end of the `logger.fatal(...)` call and several `else:` lines appear to be missing
        # from this copy.  The comments below describe only what is still visible.


            # cluster has leader key but not initialize key
            if not self.cluster.is_unlocked() and not self.sysid_valid(self.cluster.initialize) and self.has_lock():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)

            # A background action is running: only report on it (and refresh the lock if we own it).
            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # currently it can trigger only reinitialize
            msg = self.process_scheduled_action()
            if msg is not None:
                return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                    # NOTE(review): the call below is cut off mid-expression.
                    logger.fatal("system ID mismatch, node {0} belongs to a different cluster".

            # try to start dead postgres
            if not self.state_handler.is_healthy():
                msg = self.recover()
                if msg is not None:
                    return msg

                # NOTE(review): as indented, the leader-race/healthy-cluster dispatch below only
                # runs when postgres was NOT healthy, and the second return is unreachable;
                # presumably a dedent and an `else:` were lost -- confirm against upstream.
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                    return self.process_healthy_cluster()
        except DCSError:
            logger.error('Error communicating with DCS')
            # NOTE(review): the message claims a demotion, but no demote call is visible here.
            if self.state_handler.is_running() and self.state_handler.is_leader():
                return 'demoted self because DCS is not accessible and i was a leader'
        except (psycopg2.Error, PostgresConnectionException):
            logger.exception('Error communicating with Postgresql. Will try again later')

    def run_cycle(self):
        """Run one HA cycle inside the async executor context and return its result."""
        with self._async_executor:
            outcome = self._run_cycle()
        return outcome
class Ha(object):
    def __init__(self, patroni):
        """Bind the HA logic to `patroni` and start without any known cluster state.

        :param patroni: object exposing `postgresql` and `dcs` attributes.
        """
        self.patroni = patroni
        # Shortcuts to the collaborators owned by the patroni object.
        self.state_handler, self.dcs = patroni.postgresql, patroni.dcs

        # Nothing has been read from the DCS yet.
        self.cluster = self.old_cluster = None
        self.recovering = False
        self._start_timeout = None
        self._async_executor = AsyncExecutor(self.wakeup)

        # touch_member publishes several pieces of member information to the DCS. This lock keeps the
        # state and the publishing procedure consistently ordered so that stale values are not published.
        self._member_state_lock = RLock()
        # Number of concurrent requests to disable sync. While it is above zero we do not want to be a
        # synchronous standby. Changes are protected by _member_state_lock.
        self._disable_sync = 0

    def is_paused(self):
        """Return the cluster's paused flag, or the falsy `self.cluster` value when no cluster is known."""
        cluster = self.cluster
        if not cluster:
            return cluster
        return cluster.is_paused()

    def load_cluster_from_dcs(self):
        """Fetch the cluster view from the DCS and store it on `self.cluster`.

        `self.old_cluster` is only replaced when the fetched view is not unlocked
        or when no previous view has been kept yet.
        """
        fresh = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        if not (fresh.is_unlocked() and self.old_cluster):
            self.old_cluster = fresh
        self.cluster = fresh

    def acquire_lock(self):
        """Ask the DCS to acquire the leader key and return the DCS answer unchanged."""
        acquired = self.dcs.attempt_to_acquire_leader()
        return acquired

    def update_lock(self, write_leader_optime=False):
        # Asks the DCS to refresh the leader key and returns the DCS result unchanged.
        ret = self.dcs.update_leader()
        # NOTE(review): this `if` has no body in this copy of the file (a syntax error as written).
        # Going by the parameter name, the leader's operation time is presumably written here when
        # the update succeeded and `write_leader_optime` is set -- restore the lost statement(s).
        if ret and write_leader_optime:
        return ret

    def has_lock(self):
        """Return whether the leader recorded in `self.cluster` carries this node's name.

        The lock owner (or the falsy `cluster.leader` value) is logged on every call.
        """
        leader = self.cluster.leader
        lock_owner = leader and leader.name
        my_name = self.state_handler.name
        logger.info("Lock owner: %s; I am %s", lock_owner, my_name)
        return lock_owner == my_name

    def get_effective_tags(self):
        """Return a copy of the configured tags with the dynamically applied ones merged in.

        ``nosync`` is forced to True while at least one sync-disabling request is active.
        """
        effective = self.patroni.tags.copy()
        # _disable_sync may change concurrently; reading it without the lock is fine because
        # attribute get and set are atomic.
        sync_disabled = self._disable_sync > 0
        if sync_disabled:
            effective["nosync"] = True
        return effective

    def touch_member(self):
        # Publishes this member's state to the DCS as compact JSON and returns the DCS result.
        # Everything happens under _member_state_lock.
        # NOTE(review): the dict literal below is never closed and the xlog_location assignment is
        # indented one level deeper than its `if` (a wrapper such as try/except seems to have been
        # lost); this block does not parse as written -- restore the missing lines.
        with self._member_state_lock:
            data = {
                "conn_url": self.state_handler.connection_string,
                "api_url": self.patroni.api.connection_string,
                "state": self.state_handler.state,
                "role": self.state_handler.role,
            # Tags are only published when there are any (configured or dynamic).
            tags = self.get_effective_tags()
            if tags:
                data["tags"] = tags
            if self.state_handler.pending_restart:
                data["pending_restart"] = True
            # The xlog position is skipped while a background action is running.
            if not self._async_executor.busy and data["state"] in ["running", "restarting", "starting"]:
                    data["xlog_location"] = self.state_handler.xlog_position(retry=False)
            if self.patroni.scheduled_restart:
                # Work on a copy so the schedule stored on the patroni object is not replaced
                # by its ISO-formatted string.
                scheduled_restart_data = self.patroni.scheduled_restart.copy()
                scheduled_restart_data["schedule"] = scheduled_restart_data["schedule"].isoformat()
                data["scheduled_restart"] = scheduled_restart_data

            return self.dcs.touch_member(json.dumps(data, separators=(",", ":")))

    def clone(self, clone_member=None, msg="(without leader)"):
        """Clone the data directory and, on success, start following the right node.

        :param clone_member: passed through unchanged to ``state_handler.clone``.
        :param msg: suffix describing the clone source, used in log messages only.
        :returns: the result of ``state_handler.follow`` on success, None on failure.
        """
        if self.state_handler.clone(clone_member):
            logger.info("bootstrapped %s", msg)
            # Use a fresh cluster view to pick the node to replicate from.
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            return self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            # This message used to sit unreachable behind the success return,
            # so a failed clone went unreported.
            logger.error("failed to bootstrap %s", msg)

    def bootstrap(self):
        # Decides how an empty node joins: clone from an existing member, race to initialize a
        # new cluster, or wait.  Returns a human-readable status string.
        # NOTE(review): this block does not parse as written -- a `try:` (and whatever it ran
        # before the visible `except`), the rest of the `except` body, and the `else:` lines
        # separating the trailing return statements appear to be missing.  Restore before use.
        if not self.cluster.is_unlocked():  # cluster already has leader
            clone_member = self.cluster.get_clone_member(self.state_handler.name)
            # Only used to word the log/status message.
            member_role = "leader" if clone_member == self.cluster.leader else "replica"
            msg = "from {0} '{1}'".format(member_role, clone_member.name)
            # Mark the action as scheduled, then run the clone in the background.
            self._async_executor.schedule("bootstrap {0}".format(msg))
            self._async_executor.run_async(self.clone, args=(clone_member, msg))
            return "trying to bootstrap {0}".format(msg)
        # no initialize key and node is allowed to be master and has 'bootstrap' section in a configuration file
        elif self.cluster.initialize is None and not self.patroni.nofailover and "bootstrap" in self.patroni.config:
            if self.dcs.initialize(create_new=True):  # race for initialization
                    # Replace the placeholder initialize key with the real system identifier.
                    self.dcs.initialize(create_new=False, sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    logger.info("removing initialize key after failed attempt to initialize the cluster")
                # Publish the dynamic configuration to the DCS as compact JSON.
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(",", ":")))
                return "initialized a new cluster"
                return "failed to acquire initialize lock"
            # NOTE(review): no call that actually starts the leaderless clone is visible before
            # this return -- presumably lost as well; confirm.
            if self.state_handler.can_create_replica_without_replication_connection():
                return "trying to bootstrap (without leader)"
            return "waiting for leader to bootstrap"

    def recover(self):
        # Tries to bring a stopped PostgreSQL back, either as (read-only) former leader or as a
        # secondary, by delegating to `follow` with recovery=True.
        # NOTE(review): this block does not parse as written -- the `follow(...)` call at the end
        # is never closed, and lines around the timeout handling (an `else:` and whatever performs
        # the stop/demotion before the early return) appear to be missing.  As written, `timeout`
        # is also unbound when the first `if` is false.  Restore from upstream.
        if self.has_lock() and self.update_lock():
            timeout = self.patroni.config["master_start_timeout"]
            if timeout == 0:
                # We are requested to prefer failing over to restarting master. But see first if there
                # is anyone to fail over to.
                if self.is_failover_possible(self.cluster.members):
                    logger.info("Master crashed. Failing over.")
                    return "stopped PostgreSQL to fail over after a crash"
            timeout = None

        self.recovering = True
        return self.follow(
            "starting as readonly because i had the session lock", "starting as a secondary", True, True, None, timeout

    def _get_node_to_follow(self, cluster):
        # determine the node to follow. If replicatefrom tag is set,
        # try to follow the node mentioned there, otherwise, follow the leader.
        if not self.patroni.replicatefrom or self.patroni.replicatefrom == self.state_handler.name:
            node_to_follow = cluster.leader
            node_to_follow = cluster.get_member(self.patroni.replicatefrom)

        return node_to_follow if node_to_follow and node_to_follow.name != self.state_handler.name else None

    def follow(self, demote_reason, follow_reason, refresh=True, recovery=False, need_rewind=None, timeout=None):
        # Makes this node follow the appropriate member and returns `demote_reason` or
        # `follow_reason` (or a pause-specific message).
        # NOTE(review): this block does not parse as written.  The body of `if refresh:` is
        # missing, an `else:` seems to be missing between the two ways of computing `ret`, and the
        # parenthesised argument list near the end has lost the call it belongs to.  Restore from
        # upstream before relying on any of the comments below.
        if refresh:

        if recovery:
            ret = demote_reason if self.has_lock() else follow_reason
            is_leader = self.state_handler.is_leader()
            ret = demote_reason if is_leader else follow_reason

        node_to_follow = self._get_node_to_follow(self.cluster)

        # While paused nothing is changed unless a rewind is both needed and possible.
        if self.is_paused() and not (self.state_handler.need_rewind and self.state_handler.can_rewind):
            self.state_handler.set_role("master" if is_leader else "replica")
            if is_leader:
                return "continue to run as master without lock"
            elif not node_to_follow:
                return "no action"

            node_to_follow, self.cluster.leader, recovery, self._async_executor, need_rewind, timeout

        return ret

    def is_synchronous_mode(self):
        """Return True when the cluster configuration has a truthy ``synchronous_mode`` value.

        False is returned when there is no cluster or the cluster has no config.
        """
        cluster = self.cluster
        if not cluster:
            return False
        config = cluster.config
        if not config:
            return False
        return bool(config.data.get("synchronous_mode"))

    def process_sync_replication(self):
        """Process synchronous standby behavior.

        Synchronous standbys are registered in two places postgresql.conf and DCS. The order of updating them must
        be right. The invariant that should be kept is that if a node is master and sync_standby is set in DCS,
        then that node must have synchronous_standby set to that value. Or more simple, first set in postgresql.conf
        and then in DCS. When removing, first remove in DCS, then in postgresql.conf. This is so we only consider
        promoting standbys that were guaranteed to be replicating synchronously.
        """
        # NOTE(review): the docstring above had lost its closing quotes in this copy, and several
        # statements below appear to be missing as well (see the individual notes).  The code is
        # left exactly as found; restore it from upstream before relying on it.
        if self.is_synchronous_mode():
            current = self.cluster.sync.leader and self.cluster.sync.sync_standby
            picked, allow_promote = self.state_handler.pick_synchronous_standby(self.cluster)
            if picked != current:
                # We need to revoke privilege from current before replacing it in the config
                if current:
                    logger.info("Removing synchronous privilege from %s", current)
                    if not self.dcs.write_sync_state(self.state_handler.name, None, index=self.cluster.sync.index):
                        logger.info("Synchronous replication key updated by someone else.")
                        # NOTE(review): execution continues after this message; presumably an
                        # early return was lost here -- confirm.
                # NOTE(review): nothing that applies `picked` to the local PostgreSQL is visible
                # after this log line, although the docstring says postgresql.conf comes first.
                logger.info("Assigning synchronous standby status to %s", picked)

                if picked and not allow_promote:
                    # Wait for PostgreSQL to enable synchronous mode and see if we can immediately set sync_standby
                    picked, allow_promote = self.state_handler.pick_synchronous_standby(self.cluster)
                if allow_promote:
                    # Re-read the sync key so the write below is conditional on the latest index.
                    cluster = self.dcs.get_cluster()
                    if cluster.sync.leader and cluster.sync.leader != self.state_handler.name:
                        logger.info("Synchronous replication key updated by someone else")
                    if not self.dcs.write_sync_state(self.state_handler.name, picked, index=cluster.sync.index):
                        logger.info("Synchronous replication key updated by someone else")
                    # NOTE(review): as written this is logged even when the write above failed.
                    logger.info("Synchronous standby status assigned to %s", picked)
            # NOTE(review): as indented, the sync key is deleted while synchronous mode is ON;
            # presumably this belongs to a lost `else:` of the outer `if` -- confirm.
            if self.cluster.sync.leader and self.dcs.delete_sync_state(index=self.cluster.sync.index):
                logger.info("Disabled synchronous replication")

    def is_sync_standby(self, cluster):
        # Checks whether `cluster` records this node as the synchronous standby of the current leader.
        # NOTE(review): the expression is incomplete in this copy -- the first operand before the
        # leading `and` and the closing parenthesis are missing (syntax error as written).
        return (
            and cluster.sync.leader == cluster.leader.name
            and cluster.sync.sync_standby == self.state_handler.name

    def while_not_sync_standby(self, func):
        """Runs specified action while trying to make sure that the node is not assigned synchronous standby status.

        Tags us as not allowed to be a sync standby as we are going to go away, if we currently are wait for
        leader to notice and pick an alternative one or if the leader changes or goes away we are also free.

        If the connection to DCS fails we run the action anyway, as this is only a hint.

        There is a small race window where this function runs between a master picking us the sync standby and
        publishing it to the DCS. As the window is rather tiny consequences are holding up commits for one cycle
        period we don't worry about it here."""

        # Nothing to protect against when sync mode is off or this node is tagged nosync.
        if not self.is_synchronous_mode() or self.patroni.nosync:
            return func()

        # NOTE(review): the rest of this method does not parse as written.  The `try:` statements
        # matching the visible `except DCSError:`, the body of the `if not self.is_sync_standby(...)`
        # check, an `else:` before the "Updating member state failed" warning and what looks like a
        # `finally:` before the counter is decremented are missing from this copy.  As written the
        # decrement also sits after `return func()`.  Restore from upstream.
        with self._member_state_lock:
            self._disable_sync += 1
            if self.touch_member():
                # Master should notice the updated value during the next cycle. We will wait double that, if master
                # hasn't noticed the value by then not disabling sync replication is not likely to matter.
                for _ in polling_loop(timeout=self.dcs.loop_wait * 2, interval=2):
                        if not self.is_sync_standby(self.dcs.get_cluster()):
                    except DCSError:
                        logger.warning("Could not get cluster state, skipping synchronous standby disable")
                    logger.info("Waiting for master to release us from synchronous standby")
                logger.warning("Updating member state failed, skipping synchronous standby disable")

            return func()
            with self._member_state_lock:
                self._disable_sync -= 1

    def enforce_master_role(self, message, promote_message):
        # Returns `message` when the state handler already reports leader/master.
        # NOTE(review): everything after `return message` is indented inside the same `if` and is
        # therefore unreachable, so a non-master node gets None.  An `else:` branch, the statement
        # the "Inform the state handler" comment refers to, and presumably the promotion itself
        # appear to be missing from this copy -- restore from upstream.
        if self.state_handler.is_leader() or self.state_handler.role == "master":
            # Inform the state handler about its master role.
            # It may be unaware of it if postgres is promoted manually.
            return message
            if self.is_synchronous_mode():
                # Just set ourselves as the authoritative source of truth for now. We don't want to wait for standbys
                # to connect. We will try finding a synchronous standby in the next cycle.
                if not self.dcs.write_sync_state(self.state_handler.name, None, index=self.cluster.sync.index):
                    # Somebody else updated sync state, it may be due to us losing the lock. To be safe, postpone
                    # promotion until next cycle. TODO: trigger immediate retry of run_cycle
                    return "Postponing promotion because synchronous replication state was updated by somebody else"
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """Perform an HTTP GET request on `member.api_url` and fetch the member's status.

        Declared as a static method because it takes no `self`, yet is used through the
        instance as the `pool.map` callable in `fetch_nodes_statuses`.

        :param member: object providing `name` and `api_url`.
        :returns: `_MemberStatus` object; the `unknown` variant when the request or the
                  interpretation of its response failed.
        """
        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info("Got response from %s %s: %s", member.name, member.api_url, response.content)
            return _MemberStatus.from_api_response(member, response.json())
        except Exception as e:
            # Best effort: any failure simply marks the member's status as unknown.
            logger.warning("request failed: GET %s (%s)", member.api_url, e)
        return _MemberStatus.unknown(member)

    def fetch_nodes_statuses(self, members):
        """Call `fetch_node_status` for every member in parallel.

        :param members: sized iterable of members to query.
        :returns: list of per-member results, in the order of `members`;
                  an empty list when there is nothing to query.
        """
        if not members:
            # ThreadPool(0) raises ValueError, and there is nothing to query anyway.
            return []
        pool = ThreadPool(len(members))
        try:
            return pool.map(self.fetch_node_status, members)  # Run API calls on members in parallel
        finally:
            # Release the worker threads instead of leaving one pool behind per call.
            pool.close()
            pool.join()

    def is_lagging(self, xlog_location):
        """Returns if instance with an xlog should consider itself unhealthy to be promoted due to replication lag.

        The lag is measured against `cluster.last_leader_operation` (treated as 0 when unset)
        and compared with the `maximum_lag_on_failover` setting (0 when not configured).

        :param xlog_location: Current xlog location.
        :returns True when node is lagging
        """
        lag = (self.cluster.last_leader_operation or 0) - xlog_location
        return lag > self.state_handler.config.get("maximum_lag_on_failover", 0)

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """Check whether this node is healthy enough to become a new leader candidate.

        :param members: members to compare against; this node, members tagged
            ``nofailover`` and members without ``api_url`` are ignored.
        :param check_replication_lag: when true, a node that `is_lagging` is rejected outright.
        :returns: False when this node lags, when a member eligible for failover is not
            in recovery, or when such a member is ahead of this node; True otherwise.
        """
        own_position = self.state_handler.xlog_position()
        if check_replication_lag and self.is_lagging(own_position):
            return False  # Too far behind last reported xlog location on master

        # Only other nodes that are allowed to fail over and expose an API are queried
        rivals = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if not rivals:
            return True

        for status in self.fetch_nodes_statuses(rivals):
            if status.failover_limitation() is not None:
                continue  # this member cannot take over, so it is not a competitor
            if not status.in_recovery:
                logger.warning("Master (%s) is still alive", status.member.name)
                return False
            if own_position < status.xlog_location:
                return False
        return True

    def is_failover_possible(self, members):
        """Check whether at least one of *members* could take over as leader.

        This node, members tagged ``nofailover`` and members without ``api_url``
        are excluded before the remaining ones are queried.

        :param members: candidate members.
        :returns: True when some queried member has no failover limitation and is
            not lagging; False otherwise (including an empty candidate list).
        """
        ret = False
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if members:
            for st in self.fetch_nodes_statuses(members):
                not_allowed_reason = st.failover_limitation()
                if not_allowed_reason:
                    logger.info("Member %s is %s", st.member.name, not_allowed_reason)
                elif self.is_lagging(st.xlog_location):
                    logger.info("Member %s exceeds maximum replication lag", st.member.name)
                else:
                    # neither limited nor lagging: this member is a viable target
                    ret = True
        else:
            logger.warning("manual failover: members list is empty")
        return ret

    def manual_failover_process_no_leader(self):
        """Decide whether this node should run for leader while a failover key exists.

        :returns: True when this node should take the leader race, False when it
            should not, and None when a stale failover key was just removed
            (paused cluster, candidate not a member, this node is the leader).
        """
        failover = self.cluster.failover
        if failover.candidate:  # manual failover to specific member
            if failover.candidate == self.state_handler.name:  # manual failover to me
                return True
            elif self.is_paused():
                # Remove failover key if the node to failover has terminated to avoid waiting for it indefinitely
                # In order to avoid attempts to delete this key from all nodes only the master is allowed to do it.
                if (
                    not self.cluster.get_member(failover.candidate, fallback_to_leader=False)
                    and self.state_handler.is_leader()
                ):
                    logger.warning("manual failover: removing failover key because failover candidate is not running")
                    self.dcs.manual_failover("", "", index=self.cluster.failover.index)
                    return None
                return False

            # find specific node and check that it is healthy
            member = self.cluster.get_member(failover.candidate, fallback_to_leader=False)
            if member:
                st = self.fetch_node_status(member)
                not_allowed_reason = st.failover_limitation()
                if not_allowed_reason is None:  # node is healthy
                    logger.info("manual failover: to %s, i am %s", st.member.name, self.state_handler.name)
                    return False
                # we wanted to failover to specific member but it is not healthy
                logger.warning("manual failover: member %s is %s", st.member.name, not_allowed_reason)

            # at this point we should consider all members as a candidates for failover
            # i.e. we assume that failover.candidate is None
        elif self.is_paused():
            return False

        # try to pick some other members to failover and check that they are healthy
        if failover.leader:
            if self.state_handler.name == failover.leader:  # I was the leader
                # exclude me and desired member which is unhealthy (failover.candidate can be None)
                members = [m for m in self.cluster.members if m.name not in (failover.candidate, failover.leader)]
                if self.is_failover_possible(members):  # check that there are healthy members
                    return False
                else:  # I was the leader and it looks like currently I am the only healthy member
                    return True

            # at this point we assume that our node is a candidate for a failover among all nodes except former leader

        # exclude former leader from the list (failover.leader can be None)
        members = [m for m in self.cluster.members if m.name != failover.leader]
        return self._is_healthiest_node(members, check_replication_lag=False)

    def is_healthiest_node(self):
        """Decide whether this node may take part in the leader race.

        The checks visible below run in order: a manual-failover shortcut, then
        "postgres still starting" (False), "already leader" (True), paused (False),
        ``nofailover`` tag (False), a pending failover key (delegated to
        `manual_failover_process_no_leader`) and finally `_is_healthiest_node`
        over the selected members.
        """
        # NOTE(review): the first operand of this condition and its closing "):" are absent
        # from this copy of the file, so the statement does not parse as written -- restore
        # them from the upstream source rather than guessing.
        if (
            and not self.patroni.nofailover
            and self.cluster.failover
            and not self.cluster.failover.scheduled_at
            ret = self.manual_failover_process_no_leader()
            if ret is not None:  # continue if we just deleted the stale failover key as a master
                return ret

        if self.state_handler.is_starting():  # postgresql still starting up is unhealthy
            return False

        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.is_paused():
            return False

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # When in sync mode, only last known master and sync standby are allowed to promote automatically.
        # Members from the current and the previously remembered cluster view are both considered.
        all_known_members = self.cluster.members + self.old_cluster.members
        if self.is_synchronous_mode() and self.cluster.sync.leader:
            if not self.cluster.sync.matches(self.state_handler.name):
                return False
            # pick between synchronous candidates so we minimize unnecessary failovers/demotions
            members = {m.name: m for m in all_known_members if self.cluster.sync.matches(m.name)}
            # NOTE(review): an "else:" presumably preceded the next comment; as written the
            # assignment below unconditionally replaces the synchronous-candidate selection.
            # run usual health check
            members = {m.name: m for m in all_known_members}

        # Keying by name removes duplicates; a later entry with the same name wins.
        return self._is_healthiest_node(members.values())

    def release_leader_key_voluntarily(self):
        """Record in the log that the leader key has been released.

        NOTE(review): only the log call is visible in this block; whatever actually
        gives up the key is not shown here -- confirm against the full source.
        """
        logger.info("Leader key released")

    def demote(self, mode):
        """Demote PostgreSQL running as master.

        :param mode: One of offline, graceful or immediate.
            offline is used when connection to DCS is not available.
            graceful is used when failing over to another node due to user request. May only be called running async.
            immediate is used when we determine that we are not suitable for master and want to failover quickly
                without regard for data durability. May only be called synchronously.
        :returns: result of ``state_handler.follow`` in graceful mode, otherwise None.
        """
        assert mode in ["offline", "graceful", "immediate"]
        if mode != "offline":
            if mode == "immediate":
                self.state_handler.stop("immediate", checkpoint=False)
            # NOTE(review): no stop for "graceful" mode and no release of the leader key are
            # visible in this copy of the block -- confirm against the full source.
            time.sleep(2)  # Give a time to somebody to take the leader lock
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            if mode == "immediate":
                # We will try to start up as a standby now. If no one takes the leader lock before we finish
                # recovery we will try to promote ourselves.
                self._async_executor.schedule("waiting for failover to complete")
                self._async_executor.run_async(
                    self.state_handler.follow, (node_to_follow, cluster.leader, True, None, True)
                )
            else:
                return self.state_handler.follow(node_to_follow, cluster.leader, recovery=True, need_rewind=True)
        else:
            # Need to become unavailable as soon as possible, so initiate a stop here. However as we can't release
            # the leader key we don't care about confirming the shutdown quickly and can use a regular stop.
            self.state_handler.follow(None, None, recovery=True)

    def should_run_scheduled_action(self, action_name, scheduled_at, cleanup_fn):
        """Decide whether a scheduled action is due right now.

        :param action_name: name of the action, used in log messages only.
        :param scheduled_at: timezone-aware datetime the action is scheduled for (may be None).
        :param cleanup_fn: zero-argument callable that removes the schedule; called when
            the value is stale or cannot be interpreted.
        :returns: True when the action should run now (after sleeping until the
            scheduled moment if it is slightly in the future), False otherwise.
        """
        if scheduled_at and not self.is_paused():
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the scheduled action
            # Additionally, if the scheduled action cannot be executed altogether, i.e. there is an error
            # or the action is in the past - we take care of cleaning it up.
            now = datetime.datetime.now(tzutc)
            try:
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info("Awaiting %s at %s (in %.0f seconds)", action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < -int(self.dcs.loop_wait * 1.5):
                    # This means that if run_cycle gets delayed for 2.5x loop_wait we skip the
                    # scheduled action. Probably not a problem, if things are that bad we don't
                    # want to be restarting or failing over anyway.
                    logger.warning("Found a stale %s value, cleaning up: %s", action_name, scheduled_at.isoformat())
                    cleanup_fn()
                    return False

                # The value is very close to now
                time.sleep(max(delta, 0))
                logger.info("Manual scheduled %s at %s", action_name, scheduled_at.isoformat())
                return True
            except TypeError:
                # e.g. a naive datetime or a non-datetime value that cannot be subtracted from `now`
                logger.warning("Incorrect value of scheduled_at: %s", scheduled_at)
                cleanup_fn()
        return False

    def process_manual_failover_from_leader(self):
        """Checks if manual failover is requested and takes action if appropriate.

        Cleans up failover key if failover conditions are not matched.

        :returns: action message if demote was initiated, None if no action was taken"""
        failover = self.cluster.failover
        if not failover or (self.is_paused() and not self.state_handler.is_leader()):
            return None

        # A scheduled failover that is not due yet (or was just cleaned up as stale) needs no action now.
        if failover.scheduled_at and not self.should_run_scheduled_action(
            "failover", failover.scheduled_at, lambda: self.dcs.manual_failover("", "", index=failover.index)
        ):
            return None

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                if not failover.candidate and self.is_paused():
                    logger.warning("Failover is possible only to a specific candidate in a paused state")
                else:
                    members = [
                        m for m in self.cluster.members if not failover.candidate or m.name == failover.candidate
                    ]
                    if self.is_failover_possible(members):  # check that there are healthy members
                        self._async_executor.schedule("manual failover: demote")
                        self._async_executor.run_async(self.demote, ("graceful",))
                        return "manual failover: demoting myself"
                    else:
                        logger.warning("manual failover: no healthy members found, failover is not possible")
            else:
                logger.warning("manual failover: I am already the leader, no need to failover")
        else:
            logger.warning(
                "manual failover: leader name does not match: %s != %s", failover.leader, self.state_handler.name
            )

        # Every path that did not start a demotion ends up removing the failover key.
        logger.info("Cleaning up failover key")
        self.dcs.manual_failover("", "", index=failover.index)

    def process_unhealthy_cluster(self):
        """Cluster has no leader key

        Tries to acquire the leader lock when this node is the healthiest one and
        otherwise keeps following.

        :returns: the message produced by `enforce_master_role` or `follow`.
        """

        if self.is_healthiest_node():
            if self.acquire_lock():
                failover = self.cluster.failover
                if failover:
                    if self.is_paused() and failover.leader and failover.candidate:
                        logger.info("Updating failover key after acquiring leader lock...")
                        self.dcs.manual_failover("", failover.candidate, failover.scheduled_at, failover.index)
                    else:
                        logger.info("Cleaning up failover key after acquiring leader lock...")
                        self.dcs.manual_failover("", "")
                return self.enforce_master_role(
                    "acquired session lock as a leader", "promoted self to leader by acquiring session lock"
                )
            else:
                return self.follow(
                    "demoted self after trying and failing to obtain lock",
                    "following new leader after trying and failing to obtain lock",
                )
        else:
            # when we are doing manual failover there is no guaranty that new leader is ahead of any other node
            # node tagged as nofailover can be ahead of the new leader either, but it is always excluded from elections
            need_rewind = bool(self.cluster.failover) or self.patroni.nofailover
            if need_rewind:
                time.sleep(2)  # Give a time to somebody to take the leader lock

            # NOTE(review): `need_rewind` is only used for the sleep above in this copy of the
            # block; whether it should also be handed to `follow` cannot be told from here.
            if self.patroni.nofailover:
                return self.follow(
                    "demoting self because I am not allowed to become master",
                    "following a different leader because I am not allowed to promote",
                )
            return self.follow(
                "demoting self because i am not the healthiest node",
                "following a different leader because i am not the healthiest node",
            )

    def process_healthy_cluster(self):
        """Handle one cycle while the cluster has a leader.

        As lock owner this refreshes the lock, serves manual failover requests and
        enforces the master role; otherwise it follows the leader.

        :returns: status message describing the action taken.
        """
        if self.has_lock():
            if self.is_paused() and not self.state_handler.is_leader():
                if self.cluster.failover and self.cluster.failover.candidate == self.state_handler.name:
                    return "waiting to become master after promote..."

                # NOTE(review): the statements that actually remove the leader lock are not
                # visible in this copy of the block -- confirm against the full source.
                return "removed leader lock because postgres is not running as master"

            if self.update_lock(True):
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

                return self.enforce_master_role(
                    "no action.  i am the leader with the lock",
                    "promoted self to leader because i had the session lock",
                )
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error("failed to update leader lock")
                # NOTE(review): the demotion announced by the message below is not visible
                # in this copy of the block -- confirm against the full source.
                return "demoted self because failed to update leader lock in DCS"
        else:
            logger.info("does not have lock")
        return self.follow(
            "demoting self because i do not have the lock and i was a leader",
            "no action.  i am a secondary and i am following a leader",
        )

    def evaluate_scheduled_restart(self):
        """Run a previously scheduled restart when its time has come.

        Does nothing while the async executor is busy. A scheduled restart is
        cancelled when the recorded postmaster start time differs from the current
        one, i.e. postgres has already been restarted since the request was stored.
        """
        if self._async_executor.busy:  # Restart already in progress
            return None

        # restart if we need to
        restart_data = self.future_restart_scheduled()
        if restart_data:
            recent_time = self.state_handler.postmaster_start_time()
            request_time = restart_data["postmaster_start_time"]
            # check if postmaster start time has changed since the last restart
            if recent_time and request_time and recent_time != request_time:
                logger.info("Cancelling scheduled restart: postgres restart has already happened at %s", recent_time)
                # NOTE(review): the log says "cancelling" but no removal of the schedule is
                # visible here -- a statement may be missing from this copy of the file.
                return None

        # NOTE(review): the closing "):" of this condition is missing, and the extra indentation
        # of the lines below suggests a wrapping statement (e.g. try/finally) was lost as well --
        # restore from the upstream source.
        if restart_data and self.should_run_scheduled_action(
            "restart", restart_data["schedule"], self.delete_future_restart
                ret, message = self.restart(restart_data, run_async=True)
                if not ret:
                    logger.warning("Scheduled restart: %s", message)
                    return None
                return message

    def restart_matches(self, role, postgres_version, pending_restart):
        """Check the restart filters against the current state of this node.

        :param role: when set, restart only if it equals ``state_handler.role``.
        :param postgres_version: when set, version bound compared via
            ``state_handler.postgres_version_to_int``.
        :param pending_restart: when true, restart only if ``state_handler.pending_restart`` is set.
        """
        reason_to_cancel = ""
        # checking the restart filters here seem to be less ugly than moving them into the
        # run_scheduled_action.
        if role and role != self.state_handler.role:
            reason_to_cancel = "host role mismatch"

        # NOTE(review): the operand of int( and the closing "):" are missing from this copy
        # of the file; the right-hand side of the comparison cannot be told from here.
        if postgres_version and self.state_handler.postgres_version_to_int(postgres_version) <= int(
            reason_to_cancel = "postgres version mismatch"

        if pending_restart and not self.state_handler.pending_restart:
            reason_to_cancel = "pending restart flag is not set"

        # Later checks overwrite earlier ones, so only the last failing reason is reported.
        if not reason_to_cancel:
            return True
            # NOTE(review): an "else:" presumably preceded the log call below; as written it
            # is unreachable after the return above.
            logger.info("not proceeding with the restart: %s", reason_to_cancel)
        return False

    def schedule_future_restart(self, restart_data):
        """Store *restart_data* as the scheduled restart unless one is already stored.

        The current postmaster start time is written into *restart_data* in either
        case. The whole operation runs inside the async executor's context.

        :param restart_data: dict describing the restart; modified in place.
        :returns: True when the restart was scheduled, False when another one already exists.
        """
        scheduled = False
        with self._async_executor:
            restart_data["postmaster_start_time"] = self.state_handler.postmaster_start_time()
            if not self.patroni.scheduled_restart:
                self.patroni.scheduled_restart = restart_data
                scheduled = True
        return scheduled

    def delete_future_restart(self):
        """Drop the scheduled restart, if any, inside the async executor's context.

        :returns: True when a scheduled restart existed and was cleared, False otherwise.
        """
        with self._async_executor:
            if not self.patroni.scheduled_restart:
                return False
            self.patroni.scheduled_restart = {}
            return True

    def future_restart_scheduled(self):
        """Return the scheduled restart data, if a restart is scheduled.

        :returns: a shallow copy of ``patroni.scheduled_restart`` when it is a
            non-empty dict, None otherwise. A copy is returned so callers cannot
            change the stored schedule by accident.
        """
        scheduled = self.patroni.scheduled_restart
        if scheduled and isinstance(scheduled, dict):
            return scheduled.copy()
        return None

    def restart_scheduled(self):
        """Tell whether the action currently scheduled on the async executor is a restart."""
        current_action = self._async_executor.scheduled_action
        return current_action == "restart"

    def restart(self, restart_data, run_async=False):
        """Conditional and unconditional restart.

        :param restart_data: dict with optional keys ``role``, ``postgres_version``,
            ``restart_pending`` (restart filters) and ``timeout``.
        :param run_async: when true the restart is handed to the async executor and
            this method returns immediately; otherwise it waits for the result.
        :returns: ``(success, message)`` tuple.
        """
        assert isinstance(restart_data, dict)

        if not self.restart_matches(
            restart_data.get("role"), restart_data.get("postgres_version"), ("restart_pending" in restart_data)
        ):
            return (False, "restart conditions are not satisfied")

        with self._async_executor:
            prev = self._async_executor.schedule("restart")
            if prev is not None:
                return (False, prev + " already in progress")

            # Make the main loop to think that we were recovering dead postgres. If we fail
            # to start postgres after a specified timeout (see below), we need to remove
            # leader key (if it belong to us) rather than trying to start postgres once again.
            self.recovering = True

        # Now that restart is scheduled we can set timeout for startup, it will get reset
        # once async executor runs and main loop notices PostgreSQL as up.
        # NOTE(review): only the value is computed here; no call that stores it as the start
        # timeout is visible in this copy of the block -- confirm against the full source.
        timeout = restart_data.get("timeout", self.patroni.config["master_start_timeout"])

        # For non async cases we want to wait for restart to complete or timeout before returning.
        do_restart = functools.partial(self.state_handler.restart, timeout)
        if self.is_synchronous_mode() and not self.has_lock():
            do_restart = functools.partial(self.while_not_sync_standby, do_restart)

        if run_async:
            self._async_executor.run_async(do_restart)
            return (True, "restart initiated")
        else:
            res = self._async_executor.run(do_restart)
            if res:
                return (True, "restarted successfully")
            elif res is None:
                return (False, "postgres is still starting")
            else:
                return (False, "restart failed")

    def _do_reinitialize(self, cluster):
        """Clone this node again from the member selected by the cluster.

        :param cluster: not used by the visible code; ``self.cluster`` is consulted instead.
        :returns: result of `clone` for the chosen member.
        """

        source = self.cluster.get_clone_member(self.state_handler.name)
        if source == self.cluster.leader:
            source_role = "leader"
        else:
            source_role = "replica"
        description = "from {0} '{1}'".format(source_role, source.name)
        return self.clone(source, description)

    def reinitialize(self):
        """Schedule a reinitialization of this node on the async executor.

        :returns: a message explaining why reinitialize cannot be started (no leader,
            this node is the leader, another action already in progress), or None
            once `_do_reinitialize` has been handed to the executor.
        """
        with self._async_executor:
            refusal = None
            if self.cluster.is_unlocked():
                refusal = "Cluster has no leader, can not reinitialize"
            elif self.cluster.leader.name == self.state_handler.name:
                refusal = "I am the leader, can not reinitialize"
            else:
                busy_with = self._async_executor.schedule("reinitialize", immediately=True)
                if busy_with is not None:
                    refusal = "{0} already in progress".format(busy_with)

            if refusal is not None:
                return refusal

        # The executor context has been left before the background job is started.
        self._async_executor.run_async(self._do_reinitialize, args=(self.cluster,))

    def handle_long_action_in_progress(self):
        """Report status while the async executor is busy with a long-running action.

        As lock owner the leader lock is refreshed so it does not expire during the
        action.

        :returns: status message mentioning the scheduled action, or the fixed
            "not healthy enough for leader race" message when the cluster is unlocked.
        """
        if self.has_lock():
            if self.update_lock():
                return "updated leader lock during " + self._async_executor.scheduled_action
            else:
                return "failed to update leader lock during " + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return "not healthy enough for leader race"
        else:
            return self._async_executor.scheduled_action + " in progress"

    def sysid_valid(sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds sine 1970,
        # so even 1 << 32 would have 10 digits.
        sysid = str(sysid)
        return len(sysid) >= 10 and sysid.isdigit()

    def post_recover(self):
        """Report the outcome of an attempt to start postgres.

        :returns: None when postgres is running, otherwise a message that depends on
            whether this node holds the leader lock.
        """
        if self.state_handler.is_running():
            return None
        if self.has_lock():
            # NOTE(review): the removal of the key announced by this message is not visible
            # in this block -- confirm against the full source.
            return "removed leader key after trying and failing to start postgres"
        return "failed to start postgres"

    def handle_starting_instance(self):
        """Starting up PostgreSQL may take a long time. In case we are the leader we may want to
        fail over to.

        :returns: a status message when the startup is being handled here, or None
            when the main loop should continue with its normal processing.
        """

        # Check if we are in startup, when paused defer to main loop for manual failovers.
        if not self.state_handler.check_for_startup() or self.is_paused():
            return None

        # state_handler.state == 'starting' here
        if self.has_lock():
            if not self.update_lock():
                logger.info("Lost lock while starting up. Demoting self.")
                # NOTE(review): the demotion/stop announced here is not visible in this copy
                # of the block -- confirm against the full source.
                return "stopped PostgreSQL while starting up because leader key was lost"

            timeout = self._start_timeout or self.patroni.config["master_start_timeout"]
            time_left = timeout - self.state_handler.time_in_state()

            if time_left <= 0:
                if self.is_failover_possible(self.cluster.members):
                    logger.info("Demoting self because master startup is taking too long")
                    # NOTE(review): as above, the actual demotion is not visible in this copy.
                    return "stopped PostgreSQL because of startup timeout"
                else:
                    return "master start has timed out, but continuing to wait because failover is not possible"
            else:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

                return "PostgreSQL is still starting up, {0:.0f} seconds until timeout".format(time_left)
        else:
            # Use normal processing for standbys
            logger.info("Still starting up as a standby.")
            return None

    def set_start_timeout(self, value):
        """Remember how long a start as master may take before failover becomes eligible.

        Must be called when async_executor is busy or in the main thread.

        :param value: timeout to store in ``_start_timeout``.
        """
        setattr(self, "_start_timeout", value)

    def _run_cycle(self):
        # One pass of the HA loop; the returned string describes what was done (or why nothing was).
        #
        # NOTE(review): this copy of the method is visibly truncated. The opening "try:" and the
        # statements in front of the first "if", several "else:" lines, a few "if" bodies and the
        # final clause after the except handlers are absent, so the block does not parse as
        # written. Restore it from the upstream source; the notes below mark the gaps.
        dcs_failed = False

            # NOTE(review): the body of this "if" is missing.
            if not self.cluster.has_member(self.state_handler.name):

            # cluster has leader key but not initialize key
            if not (self.cluster.is_unlocked() or self.sysid_valid(self.cluster.initialize)) and self.has_lock():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)

            # Leader publishes the dynamic configuration when the cluster has none stored yet.
            if not (self.cluster.is_unlocked() or self.cluster.config and self.cluster.config.data) and self.has_lock():
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(",", ":")))
                self.cluster = self.dcs.get_cluster()

            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            msg = self.handle_starting_instance()
            if msg is not None:
                return msg

            # we've got here, so any async action has finished.
            if self.recovering and not self.state_handler.need_rewind:
                self.recovering = False
                # Check if we tried to recover and failed
                msg = self.post_recover()
                if msg is not None:
                    return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked() and not self.is_paused():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)
                # NOTE(review): an "else:" presumably preceded the next comment, and the call that
                # takes the format string below as its first argument is missing.
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                        "system ID mismatch, node %s belongs to a different cluster: %s != %s",

            if not self.state_handler.is_healthy():
                if self.is_paused():
                    if self.has_lock():
                        return "removed leader lock because postgres is not running"
                    elif not (self.state_handler.need_rewind and self.state_handler.can_rewind):
                        return "postgres is not running"

                # try to start dead postgres
                return self.recover()

                # NOTE(review): the lines below are indented one level deeper than their surroundings
                # allow; enclosing statements ("try:", "else:", "finally:") appear to be missing.
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                    msg = self.process_healthy_cluster()
                    return self.evaluate_scheduled_restart() or msg
                # we might not have a valid PostgreSQL connection here if another thread
                # stops PostgreSQL, therefore, we only reload replication slots if no
                # asynchronous processes are running (should be always the case for the master)
                if not self._async_executor.busy and not self.state_handler.is_starting():
                    # NOTE(review): the body of this "if" is missing.
                    if not self.state_handler.cb_called:
        except DCSError:
            dcs_failed = True
            logger.error("Error communicating with DCS")
            if not self.is_paused() and self.state_handler.is_running() and self.state_handler.is_leader():
                # NOTE(review): the demotion announced by this message is not visible here.
                return "demoted self because DCS is not accessible and i was a leader"
            return "DCS is not accessible"
        except (psycopg2.Error, PostgresConnectionException):
            return "Error communicating with PostgreSQL. Will try again later"
            # NOTE(review): presumably part of a missing "finally:" clause; its body is missing too.
            if not dcs_failed:

    def run_cycle(self):
        """Run one HA cycle inside the async executor's context.

        :returns: the message of `_run_cycle`, prefixed with ``"PAUSE: "`` when the
            cluster is paused.
        """
        with self._async_executor:
            info = self._run_cycle()
            # is_paused() is evaluated after the cycle, so it reflects the refreshed state
            prefix = "PAUSE: " if self.is_paused() else ""
            return prefix + info

    def watch(self, timeout):
        """Wait on the DCS for up to *timeout*, watching the leader key when that makes sense.

        :param timeout: passed through to ``dcs.watch``.
        :returns: whatever ``dcs.watch`` returns.
        """
        cluster = self.cluster
        # watch on leader key changes if the postgres is running and leader is known and current node is not lock owner
        if (
            not self._async_executor.busy
            and cluster
            and cluster.leader
            and cluster.leader.name != self.state_handler.name
        ):
            leader_index = cluster.leader.index
        else:
            leader_index = None

        return self.dcs.watch(leader_index, timeout)

    def wakeup(self):
        """Call of this method will trigger the next run of HA loop if there is
        no "active" leader watch request in progress.
        This usually happens on the master or if the node is running async action"""
        # NOTE(review): no statement follows the docstring in this copy of the file, so the
        # method currently does nothing; the code that performs the wake-up appears to be missing.
class Ha(object):

    def __init__(self, patroni):
        """Bind the HA logic to a `patroni` object.

        :param patroni: object whose `postgresql` and `dcs` attributes are stored
                        as `state_handler` and `dcs` respectively
        """
        self.patroni = patroni
        # collaborators taken from the patroni object
        self.state_handler = patroni.postgresql
        self.dcs = patroni.dcs
        # no cluster view has been loaded yet
        self.cluster = self.old_cluster = None
        self.recovering = False
        self._async_executor = AsyncExecutor()

    def load_cluster_from_dcs(self):
        """Fetch the cluster view from DCS and store it on the instance.

        `self.cluster` always receives the fresh view. `self.old_cluster` is only
        replaced when the fresh view is not unlocked, or when nothing is stored yet.
        """
        fresh = self.dcs.get_cluster()

        # We want to keep the state of cluster when it was healthy
        keep_previous = fresh.is_unlocked() and self.old_cluster
        if not keep_previous:
            self.old_cluster = fresh
        self.cluster = fresh

    def acquire_lock(self):
        """Attempt to acquire the leader key via DCS and return the result of that call."""
        acquired = self.dcs.attempt_to_acquire_leader()
        return acquired

    def update_lock(self):
        """Ask DCS to update the leader key and return the result of that call."""
        ret = self.dcs.update_leader()
        # NOTE(review): the body of the following `if` is missing in this copy of
        # the file (the statement is syntactically incomplete). Whatever should
        # happen after a successful update while no async action is running has
        # to be restored from the complete source.
        if ret and not self._async_executor.busy:
        return ret

    def has_lock(self):
        """Report whether the leader recorded in the loaded cluster view is this node.

        :returns: result of comparing the lock owner's name with `self.state_handler.name`
                  (the lock owner is falsy when the cluster has no leader)
        """
        leader = self.cluster.leader
        lock_owner = leader.name if leader else leader
        my_name = self.state_handler.name
        logger.info('Lock owner: %s; I am %s', lock_owner, my_name)
        return lock_owner == self.state_handler.name

    def touch_member(self):
        """Publish this member's current status to DCS as compact JSON.

        The payload always carries `conn_url`, `api_url`, `state` and `role`.
        `tags`, `pending_restart`, `xlog_location` and `scheduled_restart` are
        added only when the corresponding condition below holds.
        """
        data = {
            'conn_url': self.state_handler.connection_string,
            'api_url': self.patroni.api.connection_string,
            'state': self.state_handler.state,
            'role': self.state_handler.role,
        }
        if self.patroni.tags:
            data['tags'] = self.patroni.tags
        if self.state_handler.pending_restart:
            data['pending_restart'] = True
        # the xlog position is only queried while no async action is running
        if not self._async_executor.busy and data['state'] in ('running', 'restarting', 'starting'):
            # NOTE(review): this line was indented one level deeper in the damaged
            # original, which hints at a lost wrapper (possibly error handling)
            # around the call -- confirm against the complete source.
            data['xlog_location'] = self.state_handler.xlog_position()
        if self.patroni.scheduled_restart:
            scheduled_restart_data = self.patroni.scheduled_restart.copy()
            # the schedule is converted with isoformat() so that it can be serialized
            scheduled_restart_data['schedule'] = scheduled_restart_data['schedule'].isoformat()
            data['scheduled_restart'] = scheduled_restart_data

        self.dcs.touch_member(json.dumps(data, separators=(',', ':')))

    def clone(self, clone_member=None, msg='(without leader)'):
        """Clone the data directory and, on success, start following the cluster.

        :param clone_member: member passed to `state_handler.clone` (may be None)
        :param msg: text appended to the log messages describing the clone source
        """
        if self.state_handler.clone(clone_member):
            logger.info('bootstrapped %s', msg)
            # re-read the cluster: the view may have changed while cloning
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
        else:
            # the failure message belongs to the unsuccessful branch only
            logger.error('failed to bootstrap %s', msg)

    def bootstrap(self):
        """Decide how a node should obtain its data and return a status message.

        NOTE(review): this copy of the method is damaged -- a `try:` line, at least
        one `else:` line and probably further statements are missing (see the
        notes below), so the code does not parse as-is. Restore from the complete
        source before relying on it.
        """
        if not self.cluster.is_unlocked():  # cluster already has leader
            clone_member = self.cluster.get_clone_member()
            member_role = 'leader' if clone_member == self.cluster.leader else 'replica'
            msg = "from {0} '{1}'".format(member_role, clone_member.name)
            # register the action with the executor, then run the clone through it
            self._async_executor.schedule('bootstrap {0}'.format(msg))
            self._async_executor.run_async(self.clone, args=(clone_member, msg))
            return 'trying to bootstrap {0}'.format(msg)
        # no initialize key and node is allowed to be master and has 'bootstrap' section in a configuration file
        elif self.cluster.initialize is None and not self.patroni.nofailover and 'bootstrap' in self.patroni.config:
            if self.dcs.initialize(create_new=True):  # race for initialization
                # NOTE(review): the `try:` opening this block (and presumably the
                # statement that performs the actual initialization) is missing.
                    self.dcs.initialize(create_new=False, sysid=self.state_handler.sysid)
                except:  # initdb or start failed
                    # remove initialization key and give a chance to other members
                    # NOTE(review): only the log call survived; the removal itself
                    # is not visible here.
                    logger.info("removing initialize key after failed attempt to initialize the cluster")
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(',', ':')))
                return 'initialized a new cluster'
                # NOTE(review): unreachable as written; presumably the `else:` of
                # the "race for initialization" check was lost.
                return 'failed to acquire initialize lock'
            # NOTE(review): presumably an `else:` branch of the if/elif chain above -- confirm.
            if self.state_handler.can_create_replica_without_replication_connection():
                return "trying to bootstrap (without leader)"
            return 'waiting for leader to bootstrap'

    def recover(self):
        """Mark a recovery attempt as started and delegate to `follow`.

        :returns: the message returned by `follow`, called with both `refresh`
                  and `recovery` set to True
        """
        self.recovering = True
        demote_reason = "starting as readonly because i had the session lock"
        follow_reason = "starting as a secondary"
        return self.follow(demote_reason, follow_reason, True, True)

    def _get_node_to_follow(self, cluster):
        # determine the node to follow. If replicatefrom tag is set,
        # try to follow the node mentioned there, otherwise, follow the leader.
        if not self.patroni.replicatefrom or self.patroni.replicatefrom == self.state_handler.name:
            node_to_follow = cluster.leader
            node_to_follow = cluster.get_member(self.patroni.replicatefrom)

        return node_to_follow if node_to_follow and node_to_follow.name != self.state_handler.name else None

    def follow(self, demote_reason, follow_reason, refresh=True, recovery=False):
        """Make PostgreSQL follow the appropriate node and return a status message.

        :param demote_reason: message returned when this node held the leader role
                              (`recovery=False`) or the leader lock (`recovery=True`)
        :param follow_reason: message returned otherwise
        :param refresh: reload the cluster view from DCS before deciding
        :param recovery: passed on to `state_handler.follow`; also selects how
                         "was leader" is determined (see above)
        :returns: `demote_reason` or `follow_reason`
        """
        if refresh:
            # NOTE(review): the body of this branch was missing in the damaged
            # original; reloading the cluster view is what the parameter name and
            # the callers (which pass False right after loading) indicate.
            self.load_cluster_from_dcs()

        if recovery:
            ret = demote_reason if self.has_lock() else follow_reason
        else:
            ret = demote_reason if self.state_handler.is_leader() else follow_reason

        node_to_follow = self._get_node_to_follow(self.cluster)

        self.state_handler.follow(node_to_follow, self.cluster.leader, recovery, self._async_executor)

        return ret

    def enforce_master_role(self, message, promote_message):
        """Return `message` when PostgreSQL already acts as master.

        NOTE(review): this copy is truncated. As written, the second `return` is
        unreachable and the method returns None when the node is not a master;
        the branch that would make `promote_message` true (presumably an `else:`
        that promotes the node) is missing -- restore from the complete source.
        """
        if self.state_handler.is_leader() or self.state_handler.role == 'master':
            return message
            return promote_message

    @staticmethod
    def fetch_node_status(member):
        """Perform an HTTP GET request on member.api_url and fetch the member's status.

        Declared static because callers invoke it as `self.fetch_node_status(member)`
        and hand it to a thread pool with a single argument.

        :param member: cluster member; its `api_url` and `name` attributes are used
        :returns: tuple(`member`, reachable, in_recovery, xlog_location, tags)

        reachable - `False` if the request failed or the response could not be processed
        in_recovery - `True` if the reported role is not 'master' (None when unreachable)
        xlog_location - `None` for a master, otherwise `xlog.replayed_location`
                        from the JSON (0 when unreachable)
        tags - dictionary with values of different tags (i.e. nofailover)
        """
        try:
            response = requests.get(member.api_url, timeout=2, verify=False)
            logger.info('Got response from %s %s: %s', member.name, member.api_url, response.content)
            # named `status` so that the `json` module is not shadowed
            status = response.json()
            is_master = status['role'] == 'master'
            xlog_location = None if is_master else status['xlog']['replayed_location']
            return (member, True, not is_master, xlog_location, status.get('tags', {}))
        except Exception:
            # any failure (network, invalid JSON, missing keys) means "not reachable"
            logger.exception('request failed: GET %s', member.api_url)
        return (member, False, None, 0, {})

    def fetch_nodes_statuses(self, members):
        """Call `fetch_node_status` for every member in parallel.

        :param members: sequence of members to query
        :returns: list with one `fetch_node_status` result per member, in the
                  order of `members` (an empty list when `members` is empty)
        """
        if not members:
            # ThreadPool(0) raises ValueError; there is nothing to query anyway
            return []
        pool = ThreadPool(len(members))
        try:
            return pool.map(self.fetch_node_status, members)  # Run API calls on members in parallel
        finally:
            # release the worker threads even when a call raised
            pool.close()
            pool.join()

    def _is_healthiest_node(self, members, check_replication_lag=True):
        """This method tries to determine whether I am healthy enough to became a new leader candidate or not."""

        if check_replication_lag and not self.state_handler.check_replication_lag(self.cluster.last_leader_operation):
            return False  # Too far behind last reported xlog location on master

        # Prepare list of nodes to run check against
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]

        if members:
            my_xlog_location = self.state_handler.xlog_position()
            for member, reachable, in_recovery, xlog_location, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):  # If the node is unreachable it's not healhy
                    if not in_recovery:
                        logger.warning('Master (%s) is still alive', member.name)
                        return False
                    if my_xlog_location < xlog_location:
                        return False
        return True

    def is_failover_possible(self, members):
        """Check whether at least one other member could take over as leader.

        :param members: candidate members; this node, members flagged `nofailover`
                        and members without `api_url` are filtered out first
        :returns: True if at least one remaining member is reachable and does not
                  carry a truthy `nofailover` tag, False otherwise
        """
        ret = False
        members = [m for m in members if m.name != self.state_handler.name and not m.nofailover and m.api_url]
        if members:
            for member, reachable, _, _, tags in self.fetch_nodes_statuses(members):
                if reachable and not tags.get('nofailover', False):
                    ret = True  # TODO: check xlog_location
                elif not reachable:
                    logger.info('Member %s is not reachable', member.name)
                elif tags.get('nofailover', False):
                    logger.info('Member %s is not allowed to promote', member.name)
        else:
            # warn only when there really is nobody to fail over to
            logger.warning('manual failover: members list is empty')
        return ret

    def manual_failover_process_no_leader(self):
        """Decide whether this node may take part in a requested manual failover.

        Reads the failover request from `self.cluster.failover`.

        :returns: True if this node is the requested candidate, or was the leader
                  and no other member can take over; False if the requested
                  candidate is healthy or another member can take over from the
                  former leader; otherwise the result of `_is_healthiest_node`
                  over all members except the former leader
        """
        failover = self.cluster.failover
        if failover.candidate:  # manual failover to specific member
            if failover.candidate == self.state_handler.name:  # manual failover to me
                return True

            # find specific node and check that it is healthy
            member = self.cluster.get_member(failover.candidate, fallback_to_leader=False)
            if member:
                member, reachable, _, _, tags = self.fetch_node_status(member)
                if not reachable:
                    logger.warning('manual failover: member %s is unhealthy', member.name)
                elif tags.get('nofailover', False):
                    logger.warning('manual failover: member %s is not allowed to promote', member.name)
                else:  # node is healthy, let it take over
                    logger.info('manual failover: to %s, i am %s', member.name, self.state_handler.name)
                    return False

            # the desired member is not usable: from here on every member is
            # considered a candidate, as if failover.candidate was None

        # I was the leader: step aside only if somebody else is able to take over
        if failover.leader and self.state_handler.name == failover.leader:
            # exclude me and desired member which is unhealthy (failover.candidate can be None)
            others = [m for m in self.cluster.members if m.name not in (failover.candidate, failover.leader)]
            return not self.is_failover_possible(others)

        # this node competes with all nodes except the former leader
        # (failover.leader can be None)
        competitors = [m for m in self.cluster.members if m.name != failover.leader]
        return self._is_healthiest_node(competitors, check_replication_lag=False)

    def is_healthiest_node(self):
        """Tell whether this node should take part in the leader race.

        :returns: True for a node that is currently leader, False for a node with
                  `nofailover` set; with a failover request present the decision
                  of `manual_failover_process_no_leader`; otherwise the result of
                  the regular `_is_healthiest_node` check
        """
        if self.state_handler.is_leader():  # leader is always the healthiest
            return True

        if self.patroni.nofailover:  # nofailover tag makes node always unhealthy
            return False

        if self.cluster.failover:
            return self.manual_failover_process_no_leader()

        # run usual health check over the members of the current and the previously
        # stored cluster view; on a name clash the entry from old_cluster wins
        by_name = {}
        for member in self.cluster.members + self.old_cluster.members:
            by_name[member.name] = member
        return self._is_healthiest_node(by_name.values())

    def demote(self, delete_leader=True):
        """Make this node follow another node (or nobody).

        NOTE(review): this copy looks truncated. Nothing here deletes the leader
        key despite the parameter name, and the final `follow(None, None)` runs
        unconditionally right after the first `follow` call -- it presumably
        belonged to a lost `else:` branch. Restore from the complete source.
        """
        if delete_leader:
            sleep(2)  # Give a time to somebody to promote
            # re-read the cluster view to learn about a possibly new leader
            cluster = self.dcs.get_cluster()
            node_to_follow = self._get_node_to_follow(cluster)
            self.state_handler.follow(node_to_follow, cluster.leader, True)
            self.state_handler.follow(None, None)

    def should_run_scheduled_action(self, action_name, scheduled_at, cleanup_fn):
        """Decide whether a scheduled action is due right now.

        :param action_name: name used in log messages (e.g. 'failover', 'restart')
        :param scheduled_at: timezone-aware datetime of the scheduled action, or a
                             falsy value when nothing is scheduled
        :param cleanup_fn: zero-argument callable that removes the scheduled value;
                           called when the value is stale or unusable
        :returns: True when the action should be executed now, False otherwise
        """
        if scheduled_at:
            # If the scheduled action is in the far future, we shouldn't do anything and just return.
            # If the scheduled action is in the past, we consider the value to be stale and we remove
            # the value.
            # If the value is close to now, we initiate the scheduled action
            # Additionally, if the scheduled action cannot be executed altogether, i.e. there is an error
            # or the action is in the past - we take care of cleaning it up.
            now = datetime.datetime.now(pytz.utc)
            try:
                delta = (scheduled_at - now).total_seconds()

                if delta > self.dcs.loop_wait:
                    logger.info('Awaiting %s at %s (in %.0f seconds)',
                                action_name, scheduled_at.isoformat(), delta)
                    return False
                elif delta < - int(self.dcs.loop_wait * 1.5):
                    logger.warning('Found a stale %s value, cleaning up: %s',
                                   action_name, scheduled_at.isoformat())
                    # use the caller-supplied cleanup: the action is not necessarily a failover
                    cleanup_fn()
                    return False

                # The value is very close to now
                sleep(max(delta, 0))
                logger.info('Manual scheduled %s at %s', action_name, scheduled_at.isoformat())
                return True
            except TypeError:
                logger.warning('Incorrect value of scheduled_at: %s', scheduled_at)

            # the value could not be interpreted: clean it up as announced above
            cleanup_fn()
        return False

    def process_manual_failover_from_leader(self):
        """Handle a manual failover request found in DCS while this node holds the lock.

        NOTE(review): this copy is damaged -- the first `if` has no body and the
        `else:` lines pairing each warning with its condition are missing, so the
        code does not parse as-is. Restore from the complete source.
        """
        failover = self.cluster.failover

        # a scheduled failover is only processed once should_run_scheduled_action
        # says it is due; the lambda is the cleanup callback for a stale value
        # NOTE(review): the body of this `if` (presumably an early return) is missing.
        if (failover.scheduled_at and not
            self.should_run_scheduled_action("failover", failover.scheduled_at, lambda:
                                             self.dcs.manual_failover('', '', index=self.cluster.failover.index))):

        if not failover.leader or failover.leader == self.state_handler.name:
            if not failover.candidate or failover.candidate != self.state_handler.name:
                # restrict the check to the requested candidate when one is given
                members = [m for m in self.cluster.members if not failover.candidate or m.name == failover.candidate]
                if self.is_failover_possible(members):  # check that there are healthy members
                    self._async_executor.schedule('manual failover: demote')
                    # NOTE(review): no statement that actually starts the demotion is visible here.
                    return 'manual failover: demoting myself'
                    # NOTE(review): the three warnings below are misplaced as
                    # written; each presumably sat in an `else:` of the `if`
                    # whose condition it describes.
                    logger.warning('manual failover: no healthy members found, failover is not possible')
                logger.warning('manual failover: I am already the leader, no need to failover')
            logger.warning('manual failover: leader name does not match: %s != %s',
                           self.cluster.failover.leader, self.state_handler.name)

        logger.info('Trying to clean up failover key')
        self.dcs.manual_failover('', '', index=self.cluster.failover.index)

    def process_unhealthy_cluster(self):
        """Handle a cluster without a leader: race for the lock or follow somebody.

        :returns: the status message produced by `enforce_master_role` (lock
                  acquired) or by `follow` (lock not acquired / not eligible)
        """
        if self.is_healthiest_node():
            if self.acquire_lock():
                if self.cluster.failover:
                    # the failover request has been served by taking the lock
                    logger.info('Cleaning up failover key after acquiring leader lock...')
                    self.dcs.manual_failover('', '')
                return self.enforce_master_role('acquired session lock as a leader',
                                                'promoted self to leader by acquiring session lock')
            else:
                return self.follow('demoted self after trying and failing to obtain lock',
                                   'following new leader after trying and failing to obtain lock')
        else:
            if self.patroni.nofailover:
                return self.follow('demoting self because I am not allowed to become master',
                                   'following a different leader because I am not allowed to promote')
            return self.follow('demoting self because i am not the healthiest node',
                               'following a different leader because i am not the healthiest node')

    def process_healthy_cluster(self):
        """Handle a cluster that has a leader: keep the lock or follow the leader.

        :returns: a manual-failover message, the message from `enforce_master_role`
                  when the lock was updated, or the message from `follow`
        """
        if self.has_lock():
            if self.cluster.failover:
                msg = self.process_manual_failover_from_leader()
                if msg is not None:
                    return msg

            if self.update_lock():
                return self.enforce_master_role('no action.  i am the leader with the lock',
                                                'promoted self to leader because i had the session lock')
            else:
                # Either there is no connection to DCS or someone else acquired the lock
                logger.error('failed to update leader lock')
        else:
            logger.info('does not have lock')
        # the cluster view was just loaded by the caller, hence refresh=False
        return self.follow('demoting self because i do not have the lock and i was a leader',
                           'no action.  i am a secondary and i am following a leader', False)

    def evaluate_scheduled_restart(self):
        """Start a scheduled restart if one is due.

        :returns: the message returned by `restart` when it reports success;
                  None when nothing is scheduled, the restart is not due, the
                  postmaster start time differs from the recorded one, or
                  `restart` reports failure
        """
        restart_data = self.future_restart_scheduled()
        if not restart_data:
            return None

        # check if postmaster start time has changed since the restart was requested
        recent_time = self.state_handler.postmaster_start_time()
        request_time = restart_data['postmaster_start_time']
        if recent_time and request_time and recent_time != request_time:
            logger.info("Cancelling scheduled restart: postgres restart has already happened at %s", recent_time)
            return None

        if not self.should_run_scheduled_action('restart', restart_data['schedule'], self.delete_future_restart):
            return None

        ret, message = self.restart(restart_data, run_async=True)
        if ret:
            return message
        logger.warning("Scheduled restart: %s", message)
        return None

    def restart_matches(self, role, postgres_version, pending_restart):
        """Check the optional restart filters against the current node state.

        :param role: required role, or a falsy value to skip the check
        :param postgres_version: version string, or a falsy value to skip the check;
                                 the check fails when it converts to a number not
                                 greater than `state_handler.server_version`
        :param pending_restart: when truthy, the node must have `pending_restart` set
        :returns: True if no filter rejects the restart, False otherwise (the last
                  failing filter is logged)
        """
        reason_to_cancel = ""
        # checking the restart filters here seem to be less ugly than moving them into the
        # run_scheduled_action.
        if role and role != self.state_handler.role:
            reason_to_cancel = "host role mismatch"

        if (postgres_version and
           self.state_handler.postgres_version_to_int(postgres_version) <= int(self.state_handler.server_version)):
            reason_to_cancel = "postgres version mismatch"

        if pending_restart and not self.state_handler.pending_restart:
            reason_to_cancel = "pending restart flag is not set"

        if not reason_to_cancel:
            return True
        else:
            logger.info("not proceeding with the restart: %s", reason_to_cancel)
        return False

    def schedule(self, action, immediate=False):
        """Register `action` with the async executor inside its context.

        :param action: name of the action to schedule
        :param immediate: forwarded unchanged to the executor's `schedule`
        :returns: whatever the executor's `schedule` returns
        """
        executor = self._async_executor
        with executor:
            outcome = executor.schedule(action, immediate)
            return outcome

    def schedule_future_restart(self, restart_data):
        """Store `restart_data` as the scheduled restart unless one is already set.

        :param restart_data: value to store in `self.patroni.scheduled_restart`
        :returns: True if the value was stored, False if a restart was already scheduled
        """
        with self._async_executor:
            if self.patroni.scheduled_restart:
                return False
            self.patroni.scheduled_restart = restart_data
            return True

    def delete_future_restart(self):
        """Clear the scheduled restart, if any.

        :returns: True if a scheduled restart existed and was replaced by an empty
                  dict, False if there was nothing to clear
        """
        with self._async_executor:
            if not self.patroni.scheduled_restart:
                return False
            self.patroni.scheduled_restart = {}
            return True

    def future_restart_scheduled(self):
        """Return a copy of the scheduled restart data.

        :returns: a copy of `self.patroni.scheduled_restart` when it is a non-empty
                  dict, otherwise None
        """
        if self.patroni.scheduled_restart and isinstance(self.patroni.scheduled_restart, dict):
            return self.patroni.scheduled_restart.copy()
        return None

    def schedule_reinitialize(self):
        """Schedule the 'reinitialize' action; returns what `schedule` returns."""
        action = 'reinitialize'
        return self.schedule(action)

    def reinitialize_scheduled(self):
        """Tell whether the executor's scheduled action is 'reinitialize'."""
        current_action = self._async_executor.scheduled_action
        return current_action == 'reinitialize'

    def schedule_restart(self, immediate=False):
        """Schedule the 'restart' action.

        :param immediate: forwarded unchanged to `schedule`
        :returns: whatever `schedule` returns
        """
        action = 'restart'
        return self.schedule(action, immediate)

    def restart_scheduled(self):
        """Tell whether the executor's scheduled action is 'restart'."""
        current_action = self._async_executor.scheduled_action
        return current_action == 'restart'

    def restart(self, restart_data=None, run_async=False):
        """ conditional and unconditional restart

        Returns a `(success, message)` tuple.

        NOTE(review): this copy is damaged. `restart_matches` is called with two
        arguments although it takes three (role, postgres_version,
        pending_restart), and the `else:` lines of the final if/else chain are
        missing, leaving two returns misplaced. Restore from the complete source.
        """
        # a dict of restart conditions makes the restart conditional
        if (restart_data and isinstance(restart_data, dict) and
            not self.restart_matches(restart_data.get('role'),
                                     ('restart_pending' in restart_data))):
            return (False, "restart conditions are not satisfied")

        with self._async_executor:
            # schedule_restart returns the previously scheduled action, if any
            prev = self.schedule_restart(immediate=(not run_async))
            if prev is not None:
                return (False, prev + ' already in progress')
            if not run_async:
                if self._async_executor.run(self.state_handler.restart):
                    return (True, 'restarted successfully')
                    # NOTE(review): unreachable as written; presumably the `else:` branch.
                    return (False, 'restart failed')
                # NOTE(review): presumably the `else:` of `if not run_async`; no
                # statement starting the asynchronous restart is visible here.
                return (True, "restart initiated")

    def reinitialize(self, cluster):
        """Clone this node anew from the member chosen by `cluster.get_clone_member()`.

        :param cluster: cluster view providing `get_clone_member()` and `leader`
        """
        clone_member = cluster.get_clone_member()
        if clone_member == cluster.leader:
            member_role = 'leader'
        else:
            member_role = 'replica'
        source_description = "from {0} '{1}'".format(member_role, clone_member.name)
        self.clone(clone_member, source_description)

    def process_scheduled_action(self):
        """Start a scheduled reinitialize when the cluster state allows it.

        :returns: 'reinitialize started' when the reinitialize was handed to the
                  async executor, otherwise None
        """
        if self.reinitialize_scheduled():
            if self.cluster.is_unlocked():
                logger.error('Cluster has no leader, can not reinitialize')
            elif self.has_lock():
                logger.error('I am the leader, can not reinitialize')
            else:
                # only a non-leader in a cluster with a leader may reinitialize
                self._async_executor.run_async(self.reinitialize, args=(self.cluster, ))
                return 'reinitialize started'

    def handle_long_action_in_progress(self):
        """Produce the cycle's status message while an async action is running.

        A node holding the leader lock keeps updating it; other nodes only report.

        :returns: a message naming the executor's `scheduled_action`, or
                  'not healthy enough for leader race' when the cluster is unlocked
        """
        if self.has_lock():
            if self.update_lock():
                return 'updated leader lock during ' + self._async_executor.scheduled_action
            else:
                return 'failed to update leader lock during ' + self._async_executor.scheduled_action
        elif self.cluster.is_unlocked():
            return 'not healthy enough for leader race'
        else:
            return self._async_executor.scheduled_action + ' in progress'

    def sysid_valid(sysid):
        # sysid does tv_sec << 32, where tv_sec is the number of seconds sine 1970,
        # so even 1 << 32 would have 10 digits.
        sysid = str(sysid)
        return len(sysid) >= 10 and sysid.isdigit()

    def post_recover(self):
        """Report the outcome of a finished recovery attempt.

        :returns: None when PostgreSQL is running; otherwise a failure message
                  whose wording depends on whether this node has the leader lock
        """
        if self.state_handler.is_running():
            return None
        if self.has_lock():
            # NOTE(review): the message speaks of removing the leader key, but no
            # statement doing so is visible in this method -- confirm against the
            # complete source.
            return 'removed leader key after trying and failing to start postgres'
        return 'failed to start postgres'

    def _run_cycle(self):
        """Run one iteration of the HA decision logic and return a status message.

        NOTE(review): this copy is damaged -- the opening `try:` (and whatever
        preceded the first visible statement), several `else:` lines and the
        bodies of some branches are missing, so the code does not parse as-is.
        Restore from the complete source before relying on it.
        """


            # cluster has leader key but not initialize key
            if not (self.cluster.is_unlocked() or self.sysid_valid(self.cluster.initialize)) and self.has_lock():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)

            # cluster has a leader key but no (non-empty) config in DCS: the lock
            # holder publishes its dynamic configuration
            if not (self.cluster.is_unlocked() or self.cluster.config and self.cluster.config.data) and self.has_lock():
                self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(',', ':')))

            # while an async action is running only lock upkeep / reporting is done
            if self._async_executor.busy:
                return self.handle_long_action_in_progress()

            # we've got here, so any async action has finished. Check if we tried to recover and failed
            if self.recovering:
                self.recovering = False
                msg = self.post_recover()
                if msg is not None:
                    return msg

            # currently it can trigger only reinitialize
            msg = self.process_scheduled_action()
            if msg is not None:
                return msg

            # is data directory empty?
            if self.state_handler.data_directory_empty():
                return self.bootstrap()  # new node
            # "bootstrap", but data directory is not empty
            elif not self.sysid_valid(self.cluster.initialize) and self.cluster.is_unlocked():
                self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid)
                # NOTE(review): the check below is nested in the `elif` above as
                # written, where its condition can hardly hold; it presumably sat
                # in a lost `else:` branch. Nothing visible follows the log call.
                # check if we are allowed to join
                if self.sysid_valid(self.cluster.initialize) and self.cluster.initialize != self.state_handler.sysid:
                    logger.fatal("system ID mismatch, node %s belongs to a different cluster: %s != %s",
                                 self.state_handler.name, self.cluster.initialize, self.state_handler.sysid)

            # try to start dead postgres
            if not self.state_handler.is_healthy():
                msg = self.recover()
                if msg is not None:
                    return msg

                # NOTE(review): everything from here to the `except` lines is
                # indented as part of the "not healthy" branch and contains an
                # unreachable section after the first return; `else:` lines appear
                # to be missing.
                if self.cluster.is_unlocked():
                    return self.process_unhealthy_cluster()
                    msg = self.evaluate_scheduled_restart()
                    if msg is not None:
                        return msg
                    return self.process_healthy_cluster()
                # we might not have a valid PostgreSQL connection here if another thread
                # stops PostgreSQL, therefore, we only reload replication slots if no
                # asynchronous processes are running (should be always the case for the master)
                # NOTE(review): the body of this `if` is missing.
                if not self._async_executor.busy:
        except DCSError:
            logger.error('Error communicating with DCS')
            if self.state_handler.is_running() and self.state_handler.is_leader():
                # NOTE(review): no statement performing the demotion is visible here.
                return 'demoted self because DCS is not accessible and i was a leader'
        except (psycopg2.Error, PostgresConnectionException):
            logger.exception('Error communicating with PostgreSQL. Will try again later')

    def run_cycle(self):
        """Run `_run_cycle` inside the async executor's context and return its result."""
        with self._async_executor:
            outcome = self._run_cycle()
            return outcome