Example #1
0
class Patroni:
    """Tie together the local PostgreSQL handle, HA logic, DCS client and REST API.

    An instance is built from a configuration mapping, optionally bootstrapped
    with :meth:`initialize`, and then driven forever by :meth:`run`.
    """

    def __init__(self, config):
        """Build all collaborating components from *config*.

        :param config: mapping providing ``loop_wait``, ``postgresql``,
            ``restapi`` (with a ``listen`` entry of the form ``host:port``) and
            either an ``etcd`` or a ``zookeeper`` section.
        :raises ValueError: if ``restapi.listen`` does not split into exactly
            two ``:``-separated parts.
        :raises Exception: if no supported DCS section is present
            (see :meth:`get_dcs`).
        """
        self.nap_time = config['loop_wait']
        self.postgresql = Postgresql(config['postgresql'])
        self.ha = Ha(self.postgresql, self.get_dcs(self.postgresql.name, config))
        # The unpacked values are not used; the unpacking is kept so that a
        # malformed listen address fails here rather than later.
        _host, _port = config['restapi']['listen'].split(':')
        self.api = RestApiServer(self, config['restapi'])
        self.next_run = time.time()
        # TTL handed to the DCS after a failed initialization; also the
        # threshold above which touch_member() skips refreshing the member key.
        # NOTE(review): presumably seconds -- confirm against the DCS classes.
        self.shutdown_member_ttl = 300

    @staticmethod
    def get_dcs(name, config):
        """Return the DCS client selected by *config*.

        ``etcd`` takes precedence over ``zookeeper`` when both are present.

        :param name: member name passed through to the DCS client.
        :param config: full configuration mapping.
        :raises Exception: if neither ``etcd`` nor ``zookeeper`` is configured.
        """
        if 'etcd' in config:
            return Etcd(name, config['etcd'])
        if 'zookeeper' in config:
            return ZooKeeper(name, config['zookeeper'])
        raise Exception('Can not find sutable configuration of distributed configuration store')

    def touch_member(self, ttl=None):
        """Publish this member's connection string to the DCS.

        The update is skipped (and ``True`` returned) when the cached cluster
        view already holds an entry for this member whose remaining TTL exceeds
        ``shutdown_member_ttl``.

        :param ttl: optional TTL forwarded to ``dcs.touch_member``.
        :returns: ``True`` when the update was skipped, otherwise whatever
            ``dcs.touch_member`` returns.
        """
        connection_string = self.postgresql.connection_string + '?application_name=' + self.api.connection_string
        cluster = self.ha.cluster
        # Do not update member TTL when it is far from being expired
        if cluster and any(m.name == self.postgresql.name and m.real_ttl() > self.shutdown_member_ttl
                           for m in cluster.members):
            return True
        return self.ha.dcs.touch_member(connection_string, ttl)

    def cleanup_on_failed_initialization(self):
        """Clean up the DCS and local state if initialization was not successful."""
        logger.info("removing initialize key after failed attempt to initialize the cluster")
        self.ha.dcs.cancel_initialization()
        self.touch_member(self.shutdown_member_ttl)
        self.postgresql.stop()
        self.postgresql.move_data_directory()

    def initialize(self):
        """Register with the DCS and bootstrap PostgreSQL if the data directory is empty.

        With an empty data directory the node either bootstraps from the
        existing leader or races to obtain the initialize key and bootstraps a
        new cluster, retrying every 5 seconds until one of the two succeeds.
        With a non-empty data directory and a running PostgreSQL, only the
        replication slots are loaded.

        Any exception raised while bootstrapping a new cluster is re-raised
        after :meth:`cleanup_on_failed_initialization` has run.
        """
        # wait for the DCS to be available
        while not self.touch_member():
            logger.info('waiting on DCS')
            sleep(5)

        # is data directory empty?
        if self.postgresql.data_directory_empty():
            while True:
                try:
                    cluster = self.ha.dcs.get_cluster()
                    if not cluster.is_unlocked():  # the leader already exists
                        if not cluster.initialize:
                            self.ha.dcs.initialize()
                        self.postgresql.bootstrap(cluster.leader)
                        break
                    # racing to initialize
                    elif not cluster.initialize and self.ha.dcs.initialize():
                        try:
                            self.postgresql.bootstrap()
                        except BaseException:
                            # bail out and clean the initialize flag; this is
                            # deliberately broad because the error is re-raised.
                            self.cleanup_on_failed_initialization()
                            raise
                        self.ha.dcs.take_leader()
                        break
                except DCSError:
                    logger.info('waiting on DCS')
                sleep(5)
        elif self.postgresql.is_running():
            self.postgresql.load_replication_slots()

    def schedule_next_run(self):
        """Advance ``next_run`` by ``nap_time`` and wait on the DCS until then.

        If PostgreSQL reports ``is_promoted`` the schedule restarts from the
        current time. When the new deadline has already passed, ``next_run`` is
        reset to now and the method returns without waiting; otherwise
        ``dcs.watch`` is called with the remaining time.
        """
        if self.postgresql.is_promoted:
            self.next_run = time.time()
        self.next_run += self.nap_time
        current_time = time.time()
        nap_time = self.next_run - current_time
        if nap_time <= 0:
            self.next_run = current_time
        else:
            self.ha.dcs.watch(nap_time)

    def run(self):
        """Start the REST API and run the HA loop forever.

        Each iteration refreshes the member key, runs one HA cycle (logging
        its result), reconciles replication slots, reaps child processes and
        then waits for the next scheduled run.
        """
        self.api.start()
        self.next_run = time.time()

        while True:
            self.touch_member()
            logger.info(self.ha.run_cycle())
            try:
                if self.ha.state_handler.is_leader():
                    if self.ha.cluster:
                        self.ha.state_handler.create_replication_slots(self.ha.cluster)
                else:
                    self.ha.state_handler.drop_replication_slots()
            except Exception:
                # Slot maintenance is best effort, but KeyboardInterrupt and
                # SystemExit must still be able to stop the loop.
                logger.exception('Exception when changing replication slots')
            reap_children()
            self.schedule_next_run()