class Patroni: def __init__(self, config): self.nap_time = config['loop_wait'] self.postgresql = Postgresql(config['postgresql']) self.ha = Ha(self.postgresql, self.get_dcs(self.postgresql.name, config)) host, port = config['restapi']['listen'].split(':') self.api = RestApiServer(self, config['restapi']) self.next_run = time.time() self.shutdown_member_ttl = 300 @staticmethod def get_dcs(name, config): if 'etcd' in config: return Etcd(name, config['etcd']) if 'zookeeper' in config: return ZooKeeper(name, config['zookeeper']) raise Exception('Can not find sutable configuration of distributed configuration store') def touch_member(self, ttl=None): connection_string = self.postgresql.connection_string + '?application_name=' + self.api.connection_string if self.ha.cluster: for m in self.ha.cluster.members: # Do not update member TTL when it is far from being expired if m.name == self.postgresql.name and m.real_ttl() > self.shutdown_member_ttl: return True return self.ha.dcs.touch_member(connection_string, ttl) def cleanup_on_failed_initialization(self): """ cleanup the DCS if initialization was not successfull """ logger.info("removing initialize key after failed attempt to initialize the cluster") self.ha.dcs.cancel_initialization() self.touch_member(self.shutdown_member_ttl) self.postgresql.stop() self.postgresql.move_data_directory() def initialize(self): # wait for etcd to be available while not self.touch_member(): logger.info('waiting on DCS') sleep(5) # is data directory empty? if self.postgresql.data_directory_empty(): while True: try: cluster = self.ha.dcs.get_cluster() if not cluster.is_unlocked(): # the leader already exists if not cluster.initialize: self.ha.dcs.initialize() self.postgresql.bootstrap(cluster.leader) break # racing to initialize elif not cluster.initialize and self.ha.dcs.initialize(): try: self.postgresql.bootstrap() except: # bail out and clean the initialize flag. self.cleanup_on_failed_initialization() raise self.ha.dcs.take_leader() break except DCSError: logger.info('waiting on DCS') sleep(5) elif self.postgresql.is_running(): self.postgresql.load_replication_slots() def schedule_next_run(self): if self.postgresql.is_promoted: self.next_run = time.time() self.next_run += self.nap_time current_time = time.time() nap_time = self.next_run - current_time if nap_time <= 0: self.next_run = current_time else: self.ha.dcs.watch(nap_time) def run(self): self.api.start() self.next_run = time.time() while True: self.touch_member() logger.info(self.ha.run_cycle()) try: if self.ha.state_handler.is_leader(): self.ha.cluster and self.ha.state_handler.create_replication_slots(self.ha.cluster) else: self.ha.state_handler.drop_replication_slots() except: logger.exception('Exception when changing replication slots') reap_children() self.schedule_next_run()