class TestEtcd(unittest.TestCase):
    """Exercise the ``Etcd`` wrapper with its client read/write calls replaced.

    The replacement callables (``etcd_write``, ``etcd_read``, ``etcd_delete``)
    and the sleep substitutes are defined elsewhere in this module.
    """

    def __init__(self, method_name='runTest'):
        # Publish the snake_case fixture under the name unittest invokes
        # before every test method.
        self.setUp = self.set_up
        super(TestEtcd, self).__init__(method_name)

    def set_up(self):
        # Replace the global sleep for the duration of the test run.
        time.sleep = time_sleep
        machine_urls = ['http://localhost:2379', 'http://localhost:4001']
        settings = {'ttl': 30, 'host': 'localhost:2379', 'scope': 'test'}
        with patch.object(Client, 'machines') as machines_attr:
            # Reading `machines` yields a fixed list of endpoints.
            machines_attr.__get__ = Mock(return_value=machine_urls)
            self.etcd = Etcd('foo', settings)
            self.etcd.client.write = etcd_write
            self.etcd.client.read = etcd_read

    def test_get_etcd_client(self):
        time.sleep = time_sleep_exception
        with patch.object(etcd.Client, 'machines') as machines_attr:
            # Every read of `machines` raises, so building a client must fail.
            machines_attr.__get__ = Mock(side_effect=etcd.EtcdException)
            factory = self.etcd.get_etcd_client
            self.assertRaises(Exception, factory, {'discovery_srv': 'test'})

    def test_get_cluster(self):
        populated = self.etcd.get_cluster()
        self.assertIsInstance(populated, Cluster)

        # A different base path must still give a Cluster, just without a leader.
        self.etcd._base_path = '/service/nocluster'
        leaderless = self.etcd.get_cluster()
        self.assertIsInstance(leaderless, Cluster)
        self.assertIsNone(leaderless.leader)

    def test_current_leader(self):
        leader = self.etcd.current_leader()
        self.assertIsInstance(leader, Member)

        self.etcd._base_path = '/service/noleader'
        missing = self.etcd.current_leader()
        self.assertIsNone(missing)

    def test_touch_member(self):
        outcome = self.etcd.touch_member('', '')
        self.assertFalse(outcome)

    def test_take_leader(self):
        outcome = self.etcd.take_leader()
        self.assertFalse(outcome)

    def test_update_leader(self):
        outcome = self.etcd.update_leader(MockPostgresql())
        self.assertTrue(outcome)

    def test_race(self):
        outcome = self.etcd.race('')
        self.assertFalse(outcome)

    def test_delete_leader(self):
        self.etcd.client.delete = etcd_delete
        outcome = self.etcd.delete_leader()
        self.assertFalse(outcome)
class TestEtcd(unittest.TestCase):
    """Tests for ``Etcd`` that run with the client's I/O methods swapped out."""

    # Constructor arguments used for the object under test.
    def __init__(self, method_name='runTest'):
        # unittest calls `setUp`; route it to the snake_case implementation.
        self.setUp = self.set_up
        super(TestEtcd, self).__init__(method_name)

    def set_up(self):
        time.sleep = time_sleep  # module-level substitute for the real sleep
        with patch.object(Client, 'machines') as patched_machines:
            patched_machines.__get__ = Mock(
                return_value=[
                    'http://localhost:2379',
                    'http://localhost:4001',
                ]
            )
            self.etcd = Etcd(
                'foo',
                {'ttl': 30, 'host': 'localhost:2379', 'scope': 'test'},
            )
            # Route reads and writes to the module-level replacements.
            self.etcd.client.write = etcd_write
            self.etcd.client.read = etcd_read

    def test_get_etcd_client(self):
        time.sleep = time_sleep_exception
        with patch.object(etcd.Client, 'machines') as patched_machines:
            patched_machines.__get__ = Mock(side_effect=etcd.EtcdException)
            client_args = {'discovery_srv': 'test'}
            self.assertRaises(
                Exception, self.etcd.get_etcd_client, client_args
            )

    def test_get_cluster(self):
        self.assertIsInstance(self.etcd.get_cluster(), Cluster)
        # Switch to another base path and expect a leaderless Cluster.
        self.etcd._base_path = '/service/nocluster'
        result = self.etcd.get_cluster()
        self.assertIsInstance(result, Cluster)
        self.assertIsNone(result.leader)

    def test_current_leader(self):
        self.assertIsInstance(self.etcd.current_leader(), Member)
        # Under this base path no leader is expected to be reported.
        self.etcd._base_path = '/service/noleader'
        self.assertIsNone(self.etcd.current_leader())

    def test_touch_member(self):
        touched = self.etcd.touch_member('', '')
        self.assertFalse(touched)

    def test_take_leader(self):
        taken = self.etcd.take_leader()
        self.assertFalse(taken)

    def test_update_leader(self):
        postgres = MockPostgresql()
        self.assertTrue(self.etcd.update_leader(postgres))

    def test_race(self):
        won = self.etcd.race('')
        self.assertFalse(won)

    def test_delete_leader(self):
        # Deletion is routed to the module-level replacement as well.
        self.etcd.client.delete = etcd_delete
        deleted = self.etcd.delete_leader()
        self.assertFalse(deleted)
# racing to initialize wait_for_etcd("cannot initialize member without ETCD") if etcd.race("/initialize", postgresql.name): logging.info("Governor Starting up: Initialisation Race ... WON!!!") logging.info("Governor Starting up: Initialise Postgres") postgresql.initialize() logging.info("Governor Starting up: Initialise Complete") etcd.take_leader(postgresql.name) logging.info("Governor Starting up: Starting Postgres") postgresql.start() else: logging.info("Governor Starting up: Initialisation Race ... LOST") logging.info("Governor Starting up: Sync Postgres from Leader") synced_from_leader = False while not synced_from_leader: leader = etcd.current_leader() if not leader: time.sleep(5) continue if postgresql.sync_from_leader(leader): logging.info("Governor Starting up: Sync Completed") postgresql.write_recovery_conf(leader) logging.info("Governor Starting up: Starting Postgres") postgresql.start() synced_from_leader = True else: time.sleep(5) else: logging.info("Governor Starting up: Existing Data Dir") postgresql.follow_no_leader() logging.info("Governor Starting up: Starting Postgres")
def run(config):
    """Bring the local PostgreSQL node up and then drive the HA loop forever.

    Startup either initialises a fresh cluster (when this node wins the
    "/initialize" race or etcd reports no members), copies data from the
    current leader, or starts an existing data directory with no leader set.
    Afterwards each loop iteration runs one HA cycle, creates physical
    replication slots for the other members while this node is leader, and
    refreshes this node's member entry.

    :param config: mapping with at least the "etcd", "postgresql" and
        "loop_wait" keys.
    """
    readonly_notice = "running in readonly mode; cannot participate in cluster HA without etcd"
    # Creates the physical slot named %(slot)s unless it already exists.
    create_slot_sql = (
        "DO LANGUAGE plpgsql $$DECLARE somevar VARCHAR; "
        "BEGIN SELECT slot_name INTO somevar FROM pg_replication_slots "
        "WHERE slot_name = '%(slot)s' LIMIT 1; "
        "IF NOT FOUND THEN PERFORM pg_create_physical_replication_slot('%(slot)s'); "
        "END IF; END$$;"
    )

    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)

    # Make sure the database is stopped when the interpreter exits.
    atexit.register(stop_postgresql, postgresql)
    logging.info("Governor Starting up")

    if not postgresql.data_directory_empty():
        # Existing data: start without following anyone until HA decides.
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()
    else:
        logging.info("Governor Starting up: Empty Data Dir")
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)

        if etcd.race("/initialize", postgresql.name) or not etcd.members():
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            # Retry every five seconds until a leader exists and the copy works.
            while True:
                leader = etcd.current_leader()
                if leader and postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    break
                time.sleep(5)

    wait_for_etcd(readonly_notice, etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")

    while True:
        try:
            ha.run_cycle()

            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
                listing = etcd.get_client_path("/members?recursive=true")
                # Missing "node"/"nodes" entries are treated as "no members".
                for entry in listing.get("node", {}).get("nodes", []):
                    slot = entry["key"].split('/')[-1]
                    if slot == postgresql.name:
                        continue
                    postgresql.query(create_slot_sql % {"slot": slot})

            etcd.touch_member(postgresql.name, postgresql.connection_string)
            time.sleep(config["loop_wait"])
        except urllib2.URLError:
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd(readonly_notice, etcd, postgresql)
def run(config):
    """Start the local PostgreSQL node and run the HA loop indefinitely.

    Startup phase:
      * empty data directory: race for "/initialize"; the winner initialises
        PostgreSQL and takes leadership, everyone else keeps retrying a sync
        from the current leader every five seconds;
      * existing data directory: start PostgreSQL with no leader configured.

    Running phase (never returns): run one HA cycle and log its result, make
    sure a physical replication slot exists for every other member while this
    node is leader, refresh this node's member entry, then sleep for
    ``config["loop_wait"]`` seconds. A ``urllib2.URLError`` drops the node to
    "no leader" and waits for etcd again.

    :param config: mapping with at least the "etcd", "postgresql" and
        "loop_wait" keys.
    """
    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)

    # Stop PostgreSQL when the interpreter exits.
    atexit.register(stop_postgresql, postgresql)
    logging.info("Governor Starting up")

    # is data directory empty?
    if postgresql.data_directory_empty():
        logging.info("Governor Starting up: Empty Data Dir")
        # racing to initialize
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)
        if etcd.race("/initialize", postgresql.name):
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            synced_from_leader = False
            while not synced_from_leader:
                leader = etcd.current_leader()
                if not leader:
                    # Nobody holds the leader key yet; try again shortly.
                    time.sleep(5)
                    continue
                if postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    synced_from_leader = True
                else:
                    time.sleep(5)
    else:
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()

    wait_for_etcd("running in readonly mode; cannot participate in cluster HA without etcd", etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")

    while True:
        try:
            logging.info("Governor Running: %s" % ha.run_cycle())

            # create replication slots
            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
                members = etcd.get_client_path("/members?recursive=true")
                # The reply may lack "node" or "nodes" (missing or empty
                # members directory). Treat that as "no members" instead of
                # letting a KeyError, which the handler below does not catch,
                # terminate the HA loop.
                for node in members.get("node", {}).get("nodes", []):
                    member = node["key"].split('/')[-1]
                    if member != postgresql.name:
                        # NOTE(review): the member name is interpolated
                        # straight into SQL; it comes from etcd keys, so
                        # confirm those are trusted.
                        postgresql.query(
                            "DO LANGUAGE plpgsql $$DECLARE somevar VARCHAR; BEGIN SELECT slot_name INTO somevar FROM pg_replication_slots WHERE slot_name = '%(slot)s' LIMIT 1; IF NOT FOUND THEN PERFORM pg_create_physical_replication_slot('%(slot)s'); END IF; END$$;"
                            % {"slot": member}
                        )

            etcd.touch_member(postgresql.name, postgresql.connection_string)
            time.sleep(config["loop_wait"])
        except urllib2.URLError:
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd("running in readonly mode; cannot participate in cluster HA without etcd", etcd, postgresql)
shutil.rmtree(file_path) except Exception, e: raise e # make lock file def mk_lock_file(lock): f = open(lock, 'w') f.write('') f.close() # main lock_file = "/tmp/wd.lck" try: # determine that we are slave if not etcd.current_leader()["hostname"] == gethostname().split('.')[0]: if not os.path.isfile(lock_file): mk_lock_file(lock_file) # determine if reciver is not running receiver_checker_status = receiver_checker() if receiver_checker_status == False or receiver_checker_status == None: # stop governor cleanup the data dir and start the governor err_msg = "slave is out of sync on %s, re-initilizing" % (hostname) syslog.syslog(err_msg) sns.publish(err_msg) # re-initilize subprocess.call(governor_stop_cmd) # backup the file before blowing away the data dir try: os.unlink(archive_file)
def run(config):
    """Start the local PostgreSQL node and run the HA loop indefinitely.

    Startup: with an empty data directory the node races for "/initialize";
    the winner initialises PostgreSQL and takes leadership, the others retry a
    sync from the current leader every five seconds. With an existing data
    directory PostgreSQL is started with no leader configured.

    Running (never returns): log the result of one HA cycle, manage the
    replication slots named after the other members, refresh this node's
    member entry and sleep for ``config["loop_wait"]`` seconds. A
    ``urllib2.URLError`` or ``socket.timeout`` drops the node to "no leader"
    and waits for etcd again.

    :param config: mapping with at least the "etcd", "postgresql" and
        "loop_wait" keys.
    """
    etcd = Etcd(config["etcd"])
    postgresql = Postgresql(config["postgresql"])
    ha = Ha(postgresql, etcd)
    # Stop PostgreSQL when the interpreter exits.
    atexit.register(stop_postgresql, postgresql)
    # `signalhandler` is defined elsewhere in this file.
    signal.signal(signal.SIGTERM, signalhandler)
    logging.info("Governor Starting up")
    # is data directory empty?
    if postgresql.data_directory_empty():
        logging.info("Governor Starting up: Empty Data Dir")
        # racing to initialize
        wait_for_etcd("cannot initialize member without ETCD", etcd, postgresql)
        if etcd.race("/initialize", postgresql.name):
            logging.info("Governor Starting up: Initialisation Race ... WON!!!")
            logging.info("Governor Starting up: Initialise Postgres")
            postgresql.initialize()
            logging.info("Governor Starting up: Initialise Complete")
            etcd.take_leader(postgresql.name)
            logging.info("Governor Starting up: Starting Postgres")
            postgresql.start()
        else:
            logging.info("Governor Starting up: Initialisation Race ... LOST")
            logging.info("Governor Starting up: Sync Postgres from Leader")
            synced_from_leader = False
            # Keep retrying until a leader exists and the copy from it succeeds.
            while not synced_from_leader:
                leader = etcd.current_leader()
                if not leader:
                    time.sleep(5)
                    continue
                if postgresql.sync_from_leader(leader):
                    logging.info("Governor Starting up: Sync Completed")
                    postgresql.write_recovery_conf(leader)
                    logging.info("Governor Starting up: Starting Postgres")
                    postgresql.start()
                    synced_from_leader = True
                else:
                    time.sleep(5)
    else:
        logging.info("Governor Starting up: Existing Data Dir")
        postgresql.follow_no_leader()
        logging.info("Governor Starting up: Starting Postgres")
        postgresql.start()
    wait_for_etcd("running in readonly mode; cannot participate in cluster HA without etcd", etcd, postgresql)
    logging.info("Governor Running: Starting Running Loop")
    while True:
        try:
            logging.info("Governor Running: %s" % ha.run_cycle())
            # create replication slots
            if postgresql.is_leader():
                logging.info("Governor Running: I am the Leader")
                # NOTE(review): the nesting of this loop under the leader
                # check, and of the if/else below, was reconstructed from a
                # whitespace-collapsed source - confirm against the original
                # layout.
                for node in etcd.members():
                    member = node["hostname"]
                    if member != postgresql.name:
                        # Leadership is re-checked per member: ensure the slot
                        # while leading, drop it otherwise.
                        if postgresql.is_leader():
                            postgresql.ensure_replication_slot(
                                postgresql.replication_slot_name(member)
                            )
                        else:
                            postgresql.drop_replication_slot(
                                postgresql.replication_slot_name(member)
                            )
            etcd.touch_member(postgresql.name, postgresql.connection_string)
            time.sleep(config["loop_wait"])
        except (urllib2.URLError, socket.timeout):
            logging.info("Lost connection to etcd, setting no leader and waiting on etcd")
            postgresql.follow_no_leader()
            wait_for_etcd("running in readonly mode; cannot participate in cluster HA without etcd", etcd, postgresql)
class Governor:
    """Wire etcd, PostgreSQL, the HA logic and the REST API into one loop."""

    def __init__(self, config):
        """Build all collaborators from *config*.

        :param config: mapping with the "etcd", "postgresql", "restapi" and
            "loop_wait" keys; it is also handed to ``AWSConnection`` as a whole.
        """
        etcd_config = config["etcd"]
        # The member TTL must cover more than two loop iterations.
        assert etcd_config["ttl"] > 2 * config["loop_wait"]

        self.nap_time = config['loop_wait']
        self.etcd = Etcd(etcd_config)
        self.aws = AWSConnection(config)
        self.postgresql = Postgresql(config['postgresql'], self.aws.on_role_change)
        self.ha = Ha(self.postgresql, self.etcd)

        restapi_config = config['restapi']
        # The values are unused, but the unpacking raises unless the listen
        # address splits into exactly two parts on ':'.
        host, port = restapi_config['listen'].split(':')
        self.api = RestApiServer(self, restapi_config)
        self.next_run = time.time()

    def touch_member(self, ttl=None):
        """Publish this node's connection string to etcd.

        The REST API's connection string is appended as the
        ``application_name`` query parameter.

        :param ttl: passed through to ``Etcd.touch_member``.
        :returns: whatever ``Etcd.touch_member`` returns.
        """
        postgres_url = self.postgresql.connection_string
        api_url = self.api.connection_string
        advertised = postgres_url + '?application_name=' + api_url
        return self.etcd.touch_member(self.postgresql.name, advertised, ttl)

    def initialize(self):
        """Wait for etcd, then bootstrap or attach the local PostgreSQL."""
        # FIXME: isn't there a better way testing if etcd is writable?
        # Block until the member entry can be written.
        while not self.touch_member():
            logging.info('waiting on etcd')
            sleep(5)

        if not self.postgresql.data_directory_empty():
            # Existing data: only pick up the slots of a running server.
            if self.postgresql.is_running():
                self.postgresql.load_replication_slots()
            return

        # Empty data directory: race to become the initial leader.
        if self.etcd.race('/initialize', self.postgresql.name):
            self.postgresql.initialize()
            self.etcd.take_leader(self.postgresql.name)
            self.postgresql.start()
            return

        # FIXME: touch_member?
        # Lost the race: retry every five seconds until a leader exists and
        # the copy from it succeeds.
        while True:
            leader = self.etcd.current_leader()
            if not leader or not self.postgresql.sync_from_leader(leader):
                sleep(5)
                continue
            self.postgresql.write_recovery_conf(leader)
            self.postgresql.start()
            return

    def schedule_next_run(self):
        """Sleep until the next deadline, or reset it if it already passed."""
        self.next_run += self.nap_time
        now = time.time()
        remaining = self.next_run - now
        if remaining > 0:
            sleep(remaining)
        else:
            # Running late: restart the schedule from the current time.
            self.next_run = now

    def run(self):
        """Start the REST API and run the HA loop forever."""
        self.api.start()
        self.next_run = time.time()
        while True:
            self.touch_member()
            cycle_result = self.ha.run_cycle()
            logging.info(cycle_result)
            self.schedule_next_run()
except urllib2.URLError: logging.info("waiting on etcd") time.sleep(5) # is data directory empty? if postgresql.data_directory_empty(): # racing to initialize if etcd.race("/initialize", postgresql.name): postgresql.initialize() etcd.take_leader(postgresql.name) postgresql.start() postgresql.create_replication_user() else: synced_from_leader = False while not synced_from_leader: leader = etcd.current_leader() if not leader: time.sleep(5) continue if postgresql.sync_from_leader(leader): postgresql.write_recovery_conf(leader) postgresql.start() synced_from_leader = True else: time.sleep(5) else: postgresql.write_recovery_conf({"address": "postgres://169.0.0.1:5432"}) postgresql.start() while True: logging.info(ha.run_cycle())