def setUp(self):
    """Build a PgLookout whose command, alert and maintenance hooks are mocks.

    Also creates a per-test scratch directory (``self.temp_dir``) and derives
    ``self.state_file_path`` inside it.
    """
    lookout = PgLookout("pglookout.json")
    # Mocks let the tests count calls; the maintenance-mode check reports False.
    lookout.execute_external_command = Mock()
    lookout.create_alert_file = Mock()
    lookout.check_for_maintenance_mode_file = Mock(return_value=False)
    self.pglookout = lookout

    scratch_dir = tempfile.mkdtemp(prefix="pglookout_test_")
    self.temp_dir = scratch_dir
    self.state_file_path = os.path.join(scratch_dir, "state_file")
def pgl():
    """Yield a PgLookout with its outward-facing hooks mocked, then quit it.

    ``remote_conns`` is reset to an empty dict, and the cluster monitor's
    ``_connect_to_db`` is replaced by a mock. ``quit()`` is always called once
    the consumer is done, even if it raised.
    """
    lookout = PgLookout("pglookout.json")
    lookout.config["remote_conns"] = {}

    # Replace hooks with mocks; the maintenance-mode check reports False.
    lookout.execute_external_command = Mock()
    lookout.create_alert_file = Mock()
    lookout.check_for_maintenance_mode_file = Mock(return_value=False)
    lookout.cluster_monitor._connect_to_db = Mock()  # pylint: disable=protected-access

    try:
        yield lookout
    finally:
        lookout.quit()
class TestPgLookout(TestCase):
    """Tests for PgLookout's cluster-state evaluation.

    The external command, alert-file and maintenance-mode hooks are replaced
    by mocks in ``setUp`` so each test can assert on call counts instead of
    running real commands.
    """

    def setUp(self):
        """Create a PgLookout with mocked hooks and a per-test scratch directory."""
        self.pglookout = PgLookout("pglookout.json")
        self.pglookout.execute_external_command = Mock()
        self.pglookout.create_alert_file = Mock()
        self.pglookout.check_for_maintenance_mode_file = Mock()
        self.pglookout.check_for_maintenance_mode_file.return_value = False
        self.temp_dir = tempfile.mkdtemp(prefix="pglookout_test_")
        self.state_file_path = os.path.join(self.temp_dir, "state_file")

    def tearDown(self):
        """Remove the scratch directory created by ``setUp``."""
        # Guard on the directory's own name rather than a hard-coded "/tmp/"
        # parent: the temp root differs per platform and with TMPDIR, and the
        # old check silently leaked the directory in those cases.
        is_ours = os.path.basename(self.temp_dir).startswith("pglookout_test_")
        if is_ours and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

    def test_state_file_write(self):
        """Writing the cluster state produces a state file of at least two bytes."""
        self.pglookout.config['json_state_file_path'] = self.state_file_path
        self.pglookout.write_cluster_state_to_json_file()
        self.assertTrue(os.path.exists(self.state_file_path))
        # assertTrue(size, 2) would treat 2 as the failure message and only
        # check for a non-zero size; compare against the minimum explicitly.
        self.assertGreaterEqual(os.path.getsize(self.state_file_path), 2)
        os.unlink(self.state_file_path)

    def test_load_config(self):
        """load_config() replaces a stale own_db value."""
        self.pglookout.own_db = "old_value"
        self.pglookout.load_config()
        # NOTE(review): "1.2.3.4" is presumably the own_db set in pglookout.json.
        self.assertEqual(self.pglookout.own_db, "1.2.3.4")

    def _add_to_observer_state(self, observer_name, db_name, pg_last_xlog_receive_location=None,
                               pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                               fetch_time=None, db_time=None):
        """Record ``db_name``'s state as seen by ``observer_name``.

        The observer entry itself is stamped with a fresh fetch_time and
        ``connection=True``; an existing entry is updated, otherwise created.
        """
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        update_dict = {
            "fetch_time": get_iso_timestamp(),
            "connection": True,
            db_name: db_node_state,
        }
        if observer_name in self.pglookout.observer_state:
            self.pglookout.observer_state[observer_name].update(update_dict)
        else:
            self.pglookout.observer_state[observer_name] = update_dict

    def _add_db_to_cluster_state(self, instance, pg_last_xlog_receive_location=None,
                                 pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                                 fetch_time=None, db_time=None, conn_info=None):
        """Set ``instance``'s node state in cluster_state and register its connection info.

        ``conn_info`` defaults to ``{"host": instance}`` when not given.
        """
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        self.pglookout.cluster_state[instance] = db_node_state
        self.pglookout.config["remote_conns"][instance] = conn_info or {"host": instance}

    def test_check_cluster_state_warning(self):
        """The warning command runs once while lagging and the flag clears on catch-up."""
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=40.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.over_warning_limit_command = "fake_command"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertEqual(self.pglookout.create_alert_file.call_count, 1)
        self.pglookout.check_cluster_state()
        # call count does not change when we have sent a single warning
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertEqual(self.pglookout.create_alert_file.call_count, 1)
        # and then the replication catches up
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=5.0)
        self.pglookout.check_cluster_state()
        self.assertFalse(os.path.exists("replication_delay_warning"))
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave(self):
        """A lone lagging standby with a disconnected master runs the external command."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave_one_observer(self):
        """Same as the one-slave case, with an observer that also cannot reach the master."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(self):
        """No external command is run while the maintenance-mode check returns True."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = []
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_for_maintenance_mode_file.return_value = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertEqual(self.pglookout.check_for_maintenance_mode_file.call_count, 1)

    def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(self):
        """No external command is run when our own node is in never_promote_these_nodes."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["kuu"]
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_two_slaves(self):
        """We do not run the external command when another standby is further ahead."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on

    def test_check_cluster_do_failover_two_slaves_when_the_one_ahead_can_never_be_promoted(self):
        """We run the external command when the standby ahead of us may never be promoted."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["puu"]
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_with_no_master_anymore(self):
        """With no master in the cluster state, the command runs only after the timeout."""
        # this should not trigger an immediate failover as we have two
        # standbys online but we've never seen a master so we wait a while
        # and see what happens
        self.pglookout.own_db = "kuu"
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="F/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=0)
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=1)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        assert self.pglookout.execute_external_command.call_count == 0
        # now we add a fake "current" master indicating that the cluster has
        # been consistent at some point; on its own this is not yet enough
        self.pglookout.current_master = "something obsolete"
        self.pglookout.check_cluster_state()
        # No failover yet since we're not over missing_master_from_config_timeout
        assert self.pglookout.execute_external_command.call_count == 0
        # once the node list has been unchanged for the whole timeout, we fail over
        self.pglookout.cluster_nodes_change_time = time.time() - self.pglookout.missing_master_from_config_timeout
        self.pglookout.current_master = "something obsolete"
        self.pglookout.check_cluster_state()
        assert self.pglookout.execute_external_command.call_count == 1

    def test_failover_with_no_master_timeout(self):
        """Two standbys that never saw a master run the command after the timeout."""
        # this should not trigger an immediate failover as we have two
        # standbys online but we've never seen a master so we wait a while
        # and see what happens
        self.pglookout.own_db = "kuu"
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="F/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=0)
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=1)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        assert self.pglookout.execute_external_command.call_count == 0
        # indicate that we haven't seen configuration changes for 5 minutes,
        # that should trigger a failover as the timeout has passed
        self.pglookout.cluster_nodes_change_time = time.time() - 300
        self.pglookout.check_cluster_state()
        assert self.pglookout.execute_external_command.call_count == 1

    def test_failover_over_replication_lag_when_still_connected_to_master(self):
        """No external command is run in this state; the lag warning stays on."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on

    def test_failover_over_replication_lag_with_one_observer_one_slave_no_connections(self):
        """No external command is run when the observer itself is marked disconnected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        self.pglookout.observer_state["observer"]['connection'] = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on

    def test_cluster_state_when_observer_has_also_non_members_of_our_current_cluster(self):
        """Nodes only an observer reports are not counted among connected masters."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=True)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        self._add_to_observer_state("observer", "some_other_cluster", pg_last_xlog_receive_location="3/aaaaaaaa",
                                    pg_is_in_recovery=False, connection=True, replication_time_lag=0.0)
        self.pglookout.check_cluster_state()
        self.assertEqual(len(self.pglookout.connected_master_nodes), 1)
        assert 'old_master' in self.pglookout.connected_master_nodes

    def test_failover_no_connections(self):
        """No external command is run when the other standby is not connected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # the second slave is behind us and we have no connection to it
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on

    def test_failover_master_two_slaves_one_observer_no_connection_between_slaves(self):
        """An observer that reaches both standbys lets the one furthest along run the command."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.own_db = "own"
        self._add_db_to_cluster_state("other", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "other", pg_last_xlog_receive_location="1/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self._add_to_observer_state("observer", "own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True, replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        # the warning flag is expected to be off after the command has run
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_master_one_slave_one_observer_no_connections(self):
        """The command runs only once both we and the observer have lost the master."""
        self.pglookout.own_db = "own"
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=True)
        # add db state
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=True)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=40.0)
        self.pglookout.check_cluster_state()
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=True)
        self._add_to_observer_state("observer", "own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True, replication_time_lag=9.0)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=140.0)
        self.pglookout.check_cluster_state()
        # No failover yet
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on
        # observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False, connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False, replication_time_lag=140.0)
        # lose own connection to master
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # now do failover
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)

    def test_find_current_master(self):
        """check_cluster_state() records the connected non-recovery node as current master."""
        self._add_db_to_cluster_state("master", pg_is_in_recovery=False, connection=True)
        # a healthy standby alongside the master
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True, replication_time_lag=0.1)
        # here we run as the master node itself
        self.pglookout.own_db = "master"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.current_master, "master")

    def test_two_slave_failover_and_autofollow(self):
        """With autofollow on, recovery.conf is rewritten to point at the new master."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      fetch_time=datetime.datetime(year=2014, month=1, day=1))
        # We will make our own node to be the furthest from master so we don't get considered for promotion
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        self.pglookout.own_db = "own"
        self._add_db_to_cluster_state("other", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False, replication_time_lag=130.0)
        self.pglookout.check_cluster_state()
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)  # we keep the warning on
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertEqual(self.pglookout.current_master, "old_master")

        # "other" now reports itself as a connected, non-recovery node
        self._add_db_to_cluster_state("other", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=False, connection=True, replication_time_lag=0.0,
                                      conn_info={"host": "otherhost.example.com", "port": 11111})

        pg_data_dir = os.path.join(self.temp_dir, "test_pgdata")
        os.makedirs(pg_data_dir)
        primary_conninfo = "user=replication password=vjsh8l7sv4a902y1tsdz host=old_master port=5432 sslmode=prefer sslcompression=1 krbsrvname=postgres"
        old_recovery_conf = "standby_mode = 'on'\nprimary_conninfo = '{0}'\n".format(primary_conninfo)
        with open(os.path.join(pg_data_dir, "recovery.conf"), "w") as fp:
            fp.write(old_recovery_conf)

        self.pglookout.config['pg_data_directory'] = pg_data_dir
        self.pglookout.config['autofollow'] = True
        self.pglookout.primary_conninfo_template = get_connection_info(primary_conninfo)
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.current_master, "other")

        with open(os.path.join(pg_data_dir, "recovery.conf"), "r") as fp:
            new_lines = fp.read().splitlines()
        assert new_lines.pop(0).startswith("# pglookout updated primary_conninfo")
        assert new_lines.pop(0) == "standby_mode = 'on'"
        assert new_lines[0].startswith("primary_conninfo = ")
        new_primary_conninfo = new_lines.pop(0)
        assert new_lines.pop(0) == "recovery_target_timeline = 'latest'"
        assert new_lines == []
        # only host and port may differ from the original connection info
        old_conn_info = get_connection_info(primary_conninfo)
        new_conn_info = get_connection_info_from_config_line(new_primary_conninfo)
        assert new_conn_info == dict(old_conn_info, host="otherhost.example.com", port="11111")

    def test_replication_positions(self):
        """get_replication_positions() groups fresh standbys by receive location."""
        standby_nodes = {
            '10.255.255.10': {
                'connection': True,
                'db_time': '2014-08-28T14:09:57.919301+00:00Z',
                'fetch_time': '2014-08-28T14:09:57.918753Z',
                'pg_is_in_recovery': True,
                'pg_last_xlog_receive_location': '0/9000090',
                'pg_last_xlog_replay_location': '0/9000090',
                'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                'replication_time_lag': 254.341944,
            },
        }
        # the above node shouldn't show up as its fetch_time is (way) older than 20 seconds
        positions = {}
        assert self.pglookout.get_replication_positions(standby_nodes) == positions
        standby_nodes['10.255.255.10']['fetch_time'] = get_iso_timestamp()
        positions[0x9000090] = set(['10.255.255.10'])
        assert self.pglookout.get_replication_positions(standby_nodes) == positions
        # add another standby, further ahead
        standby_nodes['10.255.255.11'] = dict(standby_nodes['10.255.255.10'],
                                              pg_last_xlog_receive_location='1/0000AAAA')
        positions[1 << 32 | 0xAAAA] = set(['10.255.255.11'])
        assert self.pglookout.get_replication_positions(standby_nodes) == positions
        # add another standby which hasn't received anything
        standby_nodes['10.255.255.12'] = dict(standby_nodes['10.255.255.10'],
                                              pg_last_xlog_receive_location=None)
        positions[0x9000090].add('10.255.255.12')
        assert self.pglookout.get_replication_positions(standby_nodes) == positions

    def test_node_map(self):
        """create_node_map() picks the connected non-recovery node as master."""
        cluster_state = {
            "10.255.255.10": {
                "connection": True,
                "db_time": "2014-08-28T14:26:51.067084+00:00Z",
                "fetch_time": "2014-08-28T14:26:51.066368Z",
                "pg_is_in_recovery": False,
                "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z",
                "pg_last_xlog_receive_location": "0/9000090",
                "pg_last_xlog_replay_location": "0/9000090",
                "replication_time_lag": 1267.489727,
            },
            "10.255.255.9": {
                "connection": False,
                "fetch_time": "2014-08-28T14:26:51.068151Z",
            }
        }
        observer_state = {
            "10.255.255.11": {
                "10.255.255.10": {
                    "connection": True,
                    "db_time": "2014-08-28T14:26:47.105901+00:00Z",
                    "fetch_time": "2014-08-28T14:26:47.104849Z",
                    "pg_is_in_recovery": False,
                    "pg_last_xact_replay_timestamp": "2014-08-28T14:05:43.577357+00:00Z",
                    "pg_last_xlog_receive_location": "0/9000090",
                    "pg_last_xlog_replay_location": "0/9000090",
                    "replication_time_lag": 1263.528544,
                },
                "10.255.255.9": {
                    "connection": False,
                    "db_time": "2014-08-28T14:06:15.172820+00:00Z",
                    "fetch_time": "2014-08-28T14:26:47.107115Z",
                    "pg_is_in_recovery": False,
                    "pg_last_xact_replay_timestamp": None,
                    "pg_last_xlog_receive_location": None,
                    "pg_last_xlog_replay_location": None,
                },
                "connection": True,
                "fetch_time": "2014-08-28T14:26:51.069891Z",
            }
        }
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state, observer_state)
        self.assertEqual(master_host, "10.255.255.10")
        self.assertEqual(standby_nodes, {})

    def test_node_map_disconnected_current_master(self):
        """A disconnected node is still reported as master when it is current_master."""
        self.pglookout.current_master = "10.255.255.7"
        cluster_state = {
            "10.255.255.7": {
                "connection": False,
                "db_time": "2014-09-07T15:26:23.957151+00:00Z",
                "fetch_time": "2014-09-07T15:26:34.736495Z",
                "pg_is_in_recovery": False,
                "pg_last_xact_replay_timestamp": None,
                "pg_last_xlog_receive_location": None,
                "pg_last_xlog_replay_location": None,
            },
            "10.255.255.8": {
                "connection": True,
                "db_time": "2014-09-07T15:26:23.959461+00:00Z",
                "fetch_time": "2014-09-07T15:26:23.919281Z",
                "pg_is_in_recovery": True,
                "pg_last_xact_replay_timestamp": "2014-09-07T15:25:40.372936+00:00Z",
                "pg_last_xlog_receive_location": "0/74713D8",
                "pg_last_xlog_replay_location": "0/74713D8",
                "replication_time_lag": 43.586525,
            }
        }
        observer_state = {}
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state, observer_state)
        self.assertEqual(master_host, "10.255.255.7")
        self.assertEqual(list(standby_nodes.keys())[0], "10.255.255.8")

    def test_standbys_failover_equal_replication_positions(self):
        """With equal positions, only the standby with the highest identifier runs the command."""
        now = get_iso_timestamp(datetime.datetime.utcnow())
        self.pglookout.cluster_state = {
            "192.168.54.183": {
                "connection": True,
                "db_time": now,
                "fetch_time": now,
                "pg_is_in_recovery": True,
                "pg_last_xact_replay_timestamp": "2015-04-28T11:21:56.098946+00:00Z",
                "pg_last_xlog_receive_location": "0/70004D8",
                "pg_last_xlog_replay_location": "0/70004D8",
                "replication_time_lag": 400.435871,
            },
            "192.168.57.180": {
                "connection": False,
                "db_time": "2015-04-28T11:21:55.830432Z",
                "fetch_time": now,
                "pg_is_in_recovery": False,
                "pg_last_xact_replay_timestamp": None,
                "pg_last_xlog_receive_location": None,
                "pg_last_xlog_replay_location": None,
                "replication_time_lag": 0.0,
            },
            "192.168.63.4": {
                "connection": True,
                "db_time": now,
                "fetch_time": now,
                "pg_is_in_recovery": True,
                "pg_last_xact_replay_timestamp": "2015-04-28T11:21:56.098946+00:00Z",
                "pg_last_xlog_receive_location": "0/70004D8",
                "pg_last_xlog_replay_location": "0/70004D8",
                "replication_time_lag": 401.104655,
            },
        }
        self.pglookout.current_master = "192.168.57.180"
        # We select the node with the "highest" identifier so call_count should stay zero if we're not the
        # highest standby currently.
        self.pglookout.own_db = "192.168.54.183"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # If we're the highest we should see call_count increment
        self.pglookout.own_db = "192.168.63.4"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
def setUp(self):
    """Create a PgLookout with a mocked external command and a state-file path.

    ``self.state_file_path`` points at ``state_file`` directly inside the
    system temporary directory.
    """
    self.pglookout = PgLookout("pglookout.json")
    # Mocked so tests can count calls instead of running real commands.
    self.pglookout.execute_external_command = Mock()
    # os.path.join instead of manual os.sep concatenation.
    self.state_file_path = os.path.join(tempfile.gettempdir(), "state_file")
class TestPgLookout(TestCase):
    """Tests for PgLookout's cluster-state checks, node map and state file.

    ``execute_external_command`` is replaced by a ``Mock`` in ``setUp`` so the
    tests can count how many times an external command would have been run
    instead of actually running one.
    """

    def setUp(self):
        """Create a PgLookout from ``pglookout.json`` with the command hook mocked."""
        self.pglookout = PgLookout("pglookout.json")
        self.pglookout.execute_external_command = Mock()
        # os.path.join instead of manual ``+ os.sep +`` concatenation.
        self.state_file_path = os.path.join(tempfile.gettempdir(), "state_file")

    def tearDown(self):
        """Remove every file a test may have left behind."""
        leftover_paths = (
            self.state_file_path,
            "/tmp/pglookout_maintenance_mode_file",
            "replication_delay_warning",
            "failover_has_happened",
        )
        for path in leftover_paths:
            if os.path.exists(path):
                os.unlink(path)

    def test_parse_iso_datetime(self):
        """Round-trip a UTC timestamp through isoformat() + "Z" and parse_iso_datetime."""
        # The original code called ``date.replace(microsecond=0)`` without using
        # the result; datetime objects are immutable, so that call had no
        # effect and has been dropped.  The timestamp keeps its microseconds.
        date = datetime.datetime.utcnow()
        self.assertEqual(date, parse_iso_datetime(date.isoformat() + "Z"))

    def test_state_file_write(self):
        """Writing the cluster state creates a non-trivial file at the configured path."""
        self.pglookout.config['json_state_file_path'] = self.state_file_path
        self.pglookout.write_cluster_state_to_json_file()
        self.assertTrue(os.path.exists(self.state_file_path))
        # assertTrue(size, 2) treated ``2`` as the failure message; compare the
        # size explicitly so the file must hold at least two bytes.
        self.assertGreaterEqual(os.path.getsize(self.state_file_path), 2)
        os.unlink(self.state_file_path)

    def test_load_config(self):
        """load_config() replaces a stale ``own_db`` value with "1.2.3.4"."""
        self.pglookout.own_db = "old_value"
        self.pglookout.load_config()
        self.assertEqual(self.pglookout.own_db, "1.2.3.4")

    def _add_to_observer_state(self, observer_name, db_name, pg_last_xlog_receive_location=None,
                               pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                               fetch_time=None, db_time=None):
        """Record what ``observer_name`` reports about the database ``db_name``.

        The node state is built by ``_create_db_node_state`` from the keyword
        arguments.  The observer entry itself is always (re)marked as connected
        with a fresh fetch time; an existing entry is updated in place so that
        states of other databases reported by the same observer are kept.
        """
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        update_dict = {
            "fetch_time": get_iso_timestamp(),
            "connection": True,
            db_name: db_node_state,
        }
        if observer_name in self.pglookout.observer_state:
            self.pglookout.observer_state[observer_name].update(update_dict)
        else:
            self.pglookout.observer_state[observer_name] = update_dict

    def _add_db_to_cluster_state(self, db_name, pg_last_xlog_receive_location=None,
                                 pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                                 fetch_time=None, db_time=None):
        """Store (or overwrite) the state of ``db_name`` in ``cluster_state``."""
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        self.pglookout.cluster_state[db_name] = db_node_state

    def test_check_cluster_state_warning(self):
        """A lag of 40.0 raises the warning once; a lag of 5.0 clears it again."""
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=40.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.over_warning_limit_command = "fake_command"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertTrue(os.path.exists("replication_delay_warning"))
        self.pglookout.check_cluster_state()
        # call count does not change when we have sent a single warning
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertTrue(os.path.exists("replication_delay_warning"))
        # and then the replication catches up
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=5.0)
        self.pglookout.check_cluster_state()
        self.assertFalse(os.path.exists("replication_delay_warning"))
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave(self):
        """A single standby with an unreachable, stale master runs the command once."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave_one_observer(self):
        """Same as the one-slave case, with an observer that also cannot reach the master."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(self):
        """No command is run while the maintenance mode file exists."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        # Use a context manager so the handle is closed (and the content
        # flushed) before check_cluster_state() runs; tearDown removes the file.
        with open("/tmp/pglookout_maintenance_mode_file", "w") as maintenance_file:
            maintenance_file.write("foo")
        self.pglookout.never_promote_these_nodes = []
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(self):
        """No command is run when our own node is listed in never_promote_these_nodes."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["kuu"]
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_two_slaves(self):
        """No command is run by us when another standby reports a higher receive location."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_two_slaves_when_the_one_ahead_can_never_be_promoted(self):
        """We run the command when the standby ahead of us is in never_promote_these_nodes."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["puu"]
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_over_replication_lag_when_still_connected_to_master(self):
        """No command is run here; the master entry is added without a stale db_time."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_over_replication_lag_with_one_observer_one_slave_no_connections(self):
        """No command is run when the only observer is itself marked as disconnected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False,
                                    replication_time_lag=130.0)
        # The helper always marks the observer as connected; override that here.
        self.pglookout.observer_state["observer"]['connection'] = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_no_connections(self):
        """No command is run when the other standby is recorded as disconnected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # the second slave has a lower receive location and no connection
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False,
                                      replication_time_lag=130.0)
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_master_two_slaves_one_observer_no_connection_between_slaves(self):
        """The command runs once when an observer reports the standby we cannot reach."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own"
        self._add_db_to_cluster_state("other", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False,
                                      replication_time_lag=130.0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "other",
                                    pg_last_xlog_receive_location="1/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        # the warning flag is off after the command has been run
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_master_one_slave_one_observer_no_connections(self):
        """The command runs only after both we and the observer lose the master."""
        self.pglookout.own_db = "own"
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=True)
        # add db state
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=True)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=40.0)
        self.pglookout.check_cluster_state()
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=True)
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=9.0)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=140.0)
        self.pglookout.check_cluster_state()
        # No failover yet
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        # observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False,
                                    replication_time_lag=140.0)
        # lose own connection to master
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # now do failover
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)

    def test_find_current_master(self):
        """check_cluster_state() records the connected non-recovery node as current_master."""
        self._add_db_to_cluster_state("master", pg_is_in_recovery=False, connection=True)
        # add a connected standby with a small replication lag
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=0.1)
        self.pglookout.own_db = "master"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.current_master, "master")

    def test_replication_positions(self):
        """get_replication_positions() accepts a single standby; the result is not checked."""
        standby_nodes = {
            '10.255.255.10': {
                'fetch_time': '2014-08-28T14:09:57.918753Z',
                'pg_last_xlog_receive_location': '0/9000090',
                'pg_is_in_recovery': True,
                'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/9000090',
                'replication_time_lag': 254.341944,
                'db_time': '2014-08-28T14:09:57.919301+00:00Z',
            },
        }
        self.pglookout.get_replication_positions(standby_nodes)

    def test_node_map(self):
        """create_node_map() reports the connected master and no standbys."""
        cluster_state = {
            '10.255.255.10': {
                'fetch_time': '2014-08-28T14:26:51.066368Z',
                'pg_last_xlog_receive_location': '0/9000090',
                'pg_is_in_recovery': False,
                'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/9000090',
                'replication_time_lag': 1267.489727,
                'db_time': '2014-08-28T14:26:51.067084+00:00Z',
            },
            '10.255.255.9': {
                'connection': False,
                'fetch_time': '2014-08-28T14:26:51.068151Z',
            },
        }
        observer_state = {
            '10.255.255.11': {
                'connection': True,
                'fetch_time': '2014-08-28T14:26:51.069891Z',
                '10.255.255.10': {
                    'fetch_time': '2014-08-28T14:26:47.104849Z',
                    'pg_last_xlog_receive_location': '0/9000090',
                    'pg_is_in_recovery': False,
                    'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                    'connection': True,
                    'pg_last_xlog_replay_location': '0/9000090',
                    'replication_time_lag': 1263.528544,
                    'db_time': '2014-08-28T14:26:47.105901+00:00Z',
                },
                '10.255.255.9': {
                    'fetch_time': '2014-08-28T14:26:47.107115Z',
                    'pg_last_xlog_receive_location': None,
                    'pg_is_in_recovery': False,
                    'pg_last_xact_replay_timestamp': None,
                    'connection': False,
                    'pg_last_xlog_replay_location': None,
                    'db_time': '2014-08-28T14:06:15.172820+00:00Z',
                },
            },
        }
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state,
                                                                       observer_state)
        self.assertEqual(master_host, "10.255.255.10")
        self.assertEqual(standby_nodes, {})

    def test_node_map_disconnected_current_master(self):
        """A disconnected node that is already current_master is still reported as master."""
        self.pglookout.current_master = "10.255.255.7"
        cluster_state = {
            '10.255.255.7': {
                'fetch_time': '2014-09-07T15:26:34.736495Z',
                'pg_last_xlog_receive_location': None,
                'pg_is_in_recovery': False,
                'pg_last_xact_replay_timestamp': None,
                'connection': False,
                'pg_last_xlog_replay_location': None,
                'db_time': '2014-09-07T15:26:23.957151+00:00Z',
            },
            '10.255.255.8': {
                'fetch_time': '2014-09-07T15:26:23.919281Z',
                'pg_last_xlog_receive_location': '0/74713D8',
                'pg_is_in_recovery': True,
                'pg_last_xact_replay_timestamp': '2014-09-07T15:25:40.372936+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/74713D8',
                'replication_time_lag': 43.586525000000002,
                'db_time': '2014-09-07T15:26:23.959461+00:00Z',
            },
        }
        observer_state = {}
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state,
                                                                       observer_state)
        self.assertEqual(master_host, "10.255.255.7")
        self.assertEqual(list(standby_nodes.keys())[0], "10.255.255.8")
class TestPgLookout(TestCase):
    """Tests for PgLookout's cluster-state checks, node map and state file.

    ``execute_external_command`` is replaced by a ``Mock`` in ``setUp`` so the
    tests can count how many times an external command would have been run
    instead of actually running one.
    """

    def setUp(self):
        """Create a PgLookout from ``pglookout.json`` with the command hook mocked."""
        self.pglookout = PgLookout("pglookout.json")
        self.pglookout.execute_external_command = Mock()
        # os.path.join instead of manual ``+ os.sep +`` concatenation.
        self.state_file_path = os.path.join(tempfile.gettempdir(), "state_file")

    def tearDown(self):
        """Remove every file a test may have left behind."""
        leftover_paths = (
            self.state_file_path,
            "/tmp/pglookout_maintenance_mode_file",
            "replication_delay_warning",
            "failover_has_happened",
        )
        for path in leftover_paths:
            if os.path.exists(path):
                os.unlink(path)

    def test_parse_iso_datetime(self):
        """Round-trip a UTC timestamp through isoformat() + "Z" and parse_iso_datetime."""
        # The original code called ``date.replace(microsecond=0)`` without using
        # the result; datetime objects are immutable, so that call had no
        # effect and has been dropped.  The timestamp keeps its microseconds.
        date = datetime.datetime.utcnow()
        self.assertEqual(date, parse_iso_datetime(date.isoformat() + "Z"))

    def test_state_file_write(self):
        """Writing the cluster state creates a non-trivial file at the configured path."""
        self.pglookout.config['json_state_file_path'] = self.state_file_path
        self.pglookout.write_cluster_state_to_json_file()
        self.assertTrue(os.path.exists(self.state_file_path))
        # assertTrue(size, 2) treated ``2`` as the failure message; compare the
        # size explicitly so the file must hold at least two bytes.
        self.assertGreaterEqual(os.path.getsize(self.state_file_path), 2)
        os.unlink(self.state_file_path)

    def test_load_config(self):
        """load_config() replaces a stale ``own_db`` value with "1.2.3.4"."""
        self.pglookout.own_db = "old_value"
        self.pglookout.load_config()
        self.assertEqual(self.pglookout.own_db, "1.2.3.4")

    def _add_to_observer_state(self, observer_name, db_name, pg_last_xlog_receive_location=None,
                               pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                               fetch_time=None, db_time=None):
        """Record what ``observer_name`` reports about the database ``db_name``.

        The node state is built by ``_create_db_node_state`` from the keyword
        arguments.  The observer entry itself is always (re)marked as connected
        with a fresh fetch time; an existing entry is updated in place so that
        states of other databases reported by the same observer are kept.
        """
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        update_dict = {
            "fetch_time": get_iso_timestamp(),
            "connection": True,
            db_name: db_node_state,
        }
        if observer_name in self.pglookout.observer_state:
            self.pglookout.observer_state[observer_name].update(update_dict)
        else:
            self.pglookout.observer_state[observer_name] = update_dict

    def _add_db_to_cluster_state(self, db_name, pg_last_xlog_receive_location=None,
                                 pg_is_in_recovery=True, connection=True, replication_time_lag=None,
                                 fetch_time=None, db_time=None):
        """Store (or overwrite) the state of ``db_name`` in ``cluster_state``."""
        db_node_state = _create_db_node_state(pg_last_xlog_receive_location, pg_is_in_recovery,
                                              connection, replication_time_lag,
                                              fetch_time=fetch_time, db_time=db_time)
        self.pglookout.cluster_state[db_name] = db_node_state

    def test_check_cluster_state_warning(self):
        """A lag of 40.0 raises the warning once; a lag of 5.0 clears it again."""
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=40.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.over_warning_limit_command = "fake_command"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertTrue(os.path.exists("replication_delay_warning"))
        self.pglookout.check_cluster_state()
        # call count does not change when we have sent a single warning
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertTrue(os.path.exists("replication_delay_warning"))
        # and then the replication catches up
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=5.0)
        self.pglookout.check_cluster_state()
        self.assertFalse(os.path.exists("replication_delay_warning"))
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave(self):
        """A single standby with an unreachable, stale master runs the command once."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_one_slave_one_observer(self):
        """Same as the one-slave case, with an observer that also cannot reach the master."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_with_a_node_which_is_is_maintenance(self):
        """No command is run while the maintenance mode file exists."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        # Use a context manager so the handle is closed (and the content
        # flushed) before check_cluster_state() runs; tearDown removes the file.
        with open("/tmp/pglookout_maintenance_mode_file", "w") as maintenance_file:
            maintenance_file.write("foo")
        self.pglookout.never_promote_these_nodes = []
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_with_a_node_which_should_never_be_promoted(self):
        """No command is run when our own node is listed in never_promote_these_nodes."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["kuu"]
        self.pglookout.own_db = "kuu"
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_two_slaves(self):
        """No command is run by us when another standby reports a higher receive location."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_check_cluster_do_failover_two_slaves_when_the_one_ahead_can_never_be_promoted(self):
        """We run the command when the standby ahead of us is in never_promote_these_nodes."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # we put the second slave _WELL_ ahead
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.never_promote_these_nodes = ["puu"]
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.replication_lag_over_warning_limit = True
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_over_replication_lag_when_still_connected_to_master(self):
        """No command is run here; the master entry is added without a stale db_time."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_over_replication_lag_with_one_observer_one_slave_no_connections(self):
        """No command is run when the only observer is itself marked as disconnected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own_db", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own_db"
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own_db",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False,
                                    replication_time_lag=130.0)
        # The helper always marks the observer as connected; override that here.
        self.pglookout.observer_state["observer"]['connection'] = False
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_no_connections(self):
        """No command is run when the other standby is recorded as disconnected."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False)
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("kuu", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "kuu"
        # the second slave has a lower receive location and no connection
        self._add_db_to_cluster_state("puu", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False,
                                      replication_time_lag=130.0)
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_master_two_slaves_one_observer_no_connection_between_slaves(self):
        """The command runs once when an observer reports the standby we cannot reach."""
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # We will make our own node to be the furthest along so we get considered for promotion
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=130.0)
        self.pglookout.own_db = "own"
        self._add_db_to_cluster_state("other", pg_last_xlog_receive_location="1/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=False,
                                      replication_time_lag=130.0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "other",
                                    pg_last_xlog_receive_location="1/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=130.0)
        self.pglookout.execute_external_command.return_value = 0
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)
        # the warning flag is off after the command has been run
        self.assertFalse(self.pglookout.replication_lag_over_warning_limit)

    def test_failover_master_one_slave_one_observer_no_connections(self):
        """The command runs only after both we and the observer lose the master."""
        self.pglookout.own_db = "own"
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=True)
        # add db state
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=True)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=40.0)
        self.pglookout.check_cluster_state()
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # Add observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=True)
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=True,
                                    replication_time_lag=9.0)
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=140.0)
        self.pglookout.check_cluster_state()
        # No failover yet
        self.assertEqual(self.pglookout.execute_external_command.call_count, 0)
        # we keep the warning on
        self.assertTrue(self.pglookout.replication_lag_over_warning_limit)
        # observer state
        self._add_to_observer_state("observer", "old_master", pg_is_in_recovery=False,
                                    connection=False,
                                    db_time=datetime.datetime(year=2014, month=1, day=1))
        self._add_to_observer_state("observer", "own",
                                    pg_last_xlog_receive_location="2/aaaaaaaa",
                                    pg_is_in_recovery=True, connection=False,
                                    replication_time_lag=140.0)
        # lose own connection to master
        self._add_db_to_cluster_state("old_master", pg_is_in_recovery=False, connection=False,
                                      db_time=datetime.datetime(year=2014, month=1, day=1))
        # now do failover
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.execute_external_command.call_count, 1)

    def test_find_current_master(self):
        """check_cluster_state() records the connected non-recovery node as current_master."""
        self._add_db_to_cluster_state("master", pg_is_in_recovery=False, connection=True)
        # add a connected standby with a small replication lag
        self._add_db_to_cluster_state("own", pg_last_xlog_receive_location="2/aaaaaaaa",
                                      pg_is_in_recovery=True, connection=True,
                                      replication_time_lag=0.1)
        self.pglookout.own_db = "master"
        self.pglookout.check_cluster_state()
        self.assertEqual(self.pglookout.current_master, "master")

    def test_replication_positions(self):
        """get_replication_positions() accepts a single standby; the result is not checked."""
        standby_nodes = {
            '10.255.255.10': {
                'fetch_time': '2014-08-28T14:09:57.918753Z',
                'pg_last_xlog_receive_location': '0/9000090',
                'pg_is_in_recovery': True,
                'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/9000090',
                'replication_time_lag': 254.341944,
                'db_time': '2014-08-28T14:09:57.919301+00:00Z',
            },
        }
        self.pglookout.get_replication_positions(standby_nodes)

    def test_node_map(self):
        """create_node_map() reports the connected master and no standbys."""
        cluster_state = {
            '10.255.255.10': {
                'fetch_time': '2014-08-28T14:26:51.066368Z',
                'pg_last_xlog_receive_location': '0/9000090',
                'pg_is_in_recovery': False,
                'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/9000090',
                'replication_time_lag': 1267.489727,
                'db_time': '2014-08-28T14:26:51.067084+00:00Z',
            },
            '10.255.255.9': {
                'connection': False,
                'fetch_time': '2014-08-28T14:26:51.068151Z',
            },
        }
        observer_state = {
            '10.255.255.11': {
                'connection': True,
                'fetch_time': '2014-08-28T14:26:51.069891Z',
                '10.255.255.10': {
                    'fetch_time': '2014-08-28T14:26:47.104849Z',
                    'pg_last_xlog_receive_location': '0/9000090',
                    'pg_is_in_recovery': False,
                    'pg_last_xact_replay_timestamp': '2014-08-28T14:05:43.577357+00:00Z',
                    'connection': True,
                    'pg_last_xlog_replay_location': '0/9000090',
                    'replication_time_lag': 1263.528544,
                    'db_time': '2014-08-28T14:26:47.105901+00:00Z',
                },
                '10.255.255.9': {
                    'fetch_time': '2014-08-28T14:26:47.107115Z',
                    'pg_last_xlog_receive_location': None,
                    'pg_is_in_recovery': False,
                    'pg_last_xact_replay_timestamp': None,
                    'connection': False,
                    'pg_last_xlog_replay_location': None,
                    'db_time': '2014-08-28T14:06:15.172820+00:00Z',
                },
            },
        }
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state,
                                                                       observer_state)
        self.assertEqual(master_host, "10.255.255.10")
        self.assertEqual(standby_nodes, {})

    def test_node_map_disconnected_current_master(self):
        """A disconnected node that is already current_master is still reported as master."""
        self.pglookout.current_master = "10.255.255.7"
        cluster_state = {
            '10.255.255.7': {
                'fetch_time': '2014-09-07T15:26:34.736495Z',
                'pg_last_xlog_receive_location': None,
                'pg_is_in_recovery': False,
                'pg_last_xact_replay_timestamp': None,
                'connection': False,
                'pg_last_xlog_replay_location': None,
                'db_time': '2014-09-07T15:26:23.957151+00:00Z',
            },
            '10.255.255.8': {
                'fetch_time': '2014-09-07T15:26:23.919281Z',
                'pg_last_xlog_receive_location': '0/74713D8',
                'pg_is_in_recovery': True,
                'pg_last_xact_replay_timestamp': '2014-09-07T15:25:40.372936+00:00Z',
                'connection': True,
                'pg_last_xlog_replay_location': '0/74713D8',
                'replication_time_lag': 43.586525000000002,
                'db_time': '2014-09-07T15:26:23.959461+00:00Z',
            },
        }
        observer_state = {}
        master_host, _, standby_nodes = self.pglookout.create_node_map(cluster_state,
                                                                       observer_state)
        self.assertEqual(master_host, "10.255.255.7")
        self.assertEqual(list(standby_nodes.keys())[0], "10.255.255.8")