class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch('psycopg2.connect', psycopg2_connect) @patch('patroni.dcs.dcs_modules', Mock(return_value=['patroni.dcs.foo', 'patroni.dcs.etcd'])) @patch.object(etcd.Client, 'read', etcd_read) def setUp(self): with patch.object(Client, 'machines') as mock_machines: mock_machines.__get__ = Mock( return_value=['http://*****:*****@patch.object(Postgresql, 'fix_cluster_state', Mock()) def test_crash_recovery(self): self.p.is_running = false self.p.controldata = lambda: { 'Database cluster state': 'in production' } self.assertEquals(self.ha.run_cycle(), 'doing crash recovery in a single user mode') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_recover_with_rewind(self): self.p.is_running = false self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader') @patch.object(Postgresql, 'can_rewind', PropertyMock(return_value=True)) @patch.object(Postgresql, 'fix_cluster_state', Mock()) def test_single_user_after_recover_failed(self): self.p.controldata = lambda: {'Database cluster state': 'in recovery'} self.p.is_running = false self.p.follow = false self.assertEquals(self.ha.run_cycle(), 'starting as a secondary') self.assertEquals(self.ha.run_cycle(), 'fixing cluster state in a single user mode') @patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock): self.ha.run_cycle() exit_mock.assert_called_once_with(1) @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = false self.p.is_healthy = true self.ha.has_lock = true self.assertEquals( self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_acquire_lock_as_master(self): self.assertEquals(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEquals( self.ha.run_cycle(), 'demoted self after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEquals( self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_promote_without_watchdog(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEquals( self.ha.run_cycle(), 'Demoting self because watchdog could not be activated') self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'Not promoting self because watchdog could not be activated') def test_leader_with_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false with patch.object(Watchdog, 'is_running', PropertyMock(return_value=True)): self.assertEquals( self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader' ) def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEquals( self.ha.run_cycle(), 'demoted self because failed to update leader lock in DCS') self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'not promoting because failed to update leader lock in DCS') def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') self.ha.patroni.replicatefrom = "foo" self.assertEquals( self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') def test_follow_in_pause(self): self.ha.cluster.is_unlocked = false self.ha.is_paused = true self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'PAUSE: no action') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_follow_triggers_rewind(self): self.p.is_leader = false self.p.trigger_check_diverged_lsn() self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock( side_effect=DCSError('Etcd is not responding properly')) self.assertEquals( self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') @patch('time.sleep', Mock()) def test_bootstrap_from_another_member(self): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap from replica \'other\'') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_without_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.p.can_create_replica_without_replication_connection = MagicMock( return_value=True) self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap (without leader)') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap a new cluster') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'waiting for end of recovery after bootstrap') self.p.is_leader = true self.assertEquals(self.ha.run_cycle(), 'running post_bootstrap') self.assertEquals(self.ha.run_cycle(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running = false self.assertRaises(PatroniException, self.ha.post_bootstrap) def test_bootstrap_release_initialize_key_on_watchdog_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running.return_value = MockPostmaster() self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEquals(self.ha.post_bootstrap(), 'running post_bootstrap') self.assertRaises(PatroniException, self.ha.post_bootstrap) @patch('psycopg2.connect', psycopg2_connect) def test_reinitialize(self): self.assertIsNotNone(self.ha.reinitialize()) self.ha.cluster = get_cluster_initialized_with_leader() self.assertIsNone(self.ha.reinitialize()) self.assertIsNotNone(self.ha.reinitialize()) self.ha.state_handler.name = self.ha.cluster.leader.name self.assertIsNotNone(self.ha.reinitialize()) @patch('time.sleep', Mock()) def test_restart(self): self.assertEquals(self.ha.restart({}), (True, 'restarted successfully')) self.p.restart = Mock(return_value=None) self.assertEquals(self.ha.restart({}), (False, 'postgres is still starting')) self.p.restart = false self.assertEquals(self.ha.restart({}), (False, 'restart failed')) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.reinitialize() self.assertEquals(self.ha.restart({}), (False, 'reinitialize already in progress')) with patch.object(self.ha, "restart_matches", return_value=False): self.assertEquals(self.ha.restart({'foo': 'bar'}), (False, "restart conditions are not satisfied")) @patch('os.kill', Mock()) def test_restart_in_progress(self): with patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=True)): self.ha.restart({}, run_async=True) self.assertTrue(self.ha.restart_scheduled()) self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.p.set_role('master') with patch('patroni.async_executor.CriticalTask.cancel', Mock(return_value=False)): with patch( 'patroni.postgresql.Postgresql.terminate_starting_postmaster' ) as mock_terminate: self.assertEquals(self.ha.run_cycle(), 'lost leader lock during restart') mock_terminate.assert_called() @patch('requests.get', requests_get) def test_manual_failover_from_leader(self): self.ha.fetch_node_status = get_node_status() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', '', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, '', 'blabla', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.p.rewind_needed_and_possible = true self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = get_node_status(nofailover=True) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(watchdog_failed=True) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(wal_position=1) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.ha.run_cycle() scheduled = datetime.datetime.utcnow().replace(tzinfo=tzutc) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_pause(self): self.ha.has_lock = true self.ha.is_paused = true scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, self.p.name, '', None)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_synchronous_mode(self): self.p.is_leader = true self.ha.has_lock = true self.ha.is_synchronous_mode = true self.ha.is_failover_possible = false self.ha.process_sync_replication = Mock() self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, self.p.name, 'a', None), (self.p.name, None)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, self.p.name, 'a', None), (self.p.name, 'a')) self.ha.is_failover_possible = true self.assertEquals('manual failover: demoting myself', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'leader', None)) self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = get_node_status( ) # accessible, in_recovery self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, self.p.name, '', None)) self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) self.ha.fetch_node_status = get_node_status( reachable=False) # inaccessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # set failover flag to True for all members of the cluster # this should elect the current member, as we are not going to call the API for it. self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'other', None)) self.ha.fetch_node_status = get_node_status( nofailover=True) # accessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # same as previous, but set the current member to nofailover. In no case it should be elected as a leader self.ha.patroni.nofailover = True self.assertEquals( self.ha.run_cycle(), 'following a different leader because I am not allowed to promote') def test_manual_failover_process_no_leader_in_pause(self): self.ha.is_paused = true self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'other', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', '', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', 'blabla', None)) self.assertEquals('PAUSE: acquired session lock as a leader', self.ha.run_cycle()) self.p.is_leader = false self.p.set_role('replica') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', self.p.name, None)) self.assertEquals( self.ha.run_cycle(), 'PAUSE: promoted self to leader by acquiring session lock') def test_is_healthiest_node(self): self.ha.state_handler.is_leader = false self.ha.patroni.nofailover = False self.ha.fetch_node_status = get_node_status() self.assertTrue(self.ha.is_healthiest_node()) with patch.object(Watchdog, 'is_healthy', PropertyMock(return_value=False)): self.assertFalse(self.ha.is_healthiest_node()) with patch('patroni.postgresql.Postgresql.is_starting', return_value=True): self.assertFalse(self.ha.is_healthiest_node()) self.ha.is_paused = true self.assertFalse(self.ha.is_healthiest_node()) def test__is_healthiest_node(self): self.assertTrue( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = get_node_status( ) # accessible, in_recovery self.assertTrue( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status( in_recovery=False) # accessible, not in_recovery self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status( wal_position=11) # accessible, in_recovery, wal position ahead self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) with patch('patroni.postgresql.Postgresql.wal_position', return_value=1): self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = True self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = False @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://*****:*****@patch('patroni.ha.Ha.update_lock', return_value=True) @patch('patroni.ha.Ha.demote') def test_starting_timeout(self, demote, update_lock): def check_calls(seq): for mock, called in seq: if called: mock.assert_called_once() else: mock.assert_not_called() mock.reset_mock() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader() self.p.check_for_startup = true self.p.time_in_state = lambda: 30 self.assertEquals( self.ha.run_cycle(), 'PostgreSQL is still starting up, 270 seconds until timeout') check_calls([(update_lock, True), (demote, False)]) self.p.time_in_state = lambda: 350 self.ha.fetch_node_status = get_node_status( reachable=False) # inaccessible, in_recovery self.assertEquals( self.ha.run_cycle(), 'master start has timed out, but continuing to wait because failover is not possible' ) check_calls([(update_lock, True), (demote, False)]) self.ha.fetch_node_status = get_node_status( ) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'stopped PostgreSQL because of startup timeout') check_calls([(update_lock, True), (demote, True)]) update_lock.return_value = False self.assertEquals( self.ha.run_cycle(), 'stopped PostgreSQL while starting up because leader key was lost') check_calls([(update_lock, True), (demote, True)]) self.ha.has_lock = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') check_calls([(update_lock, False), (demote, False)]) def test_manual_failover_while_starting(self): self.ha.has_lock = true self.p.check_for_startup = true f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.ha.fetch_node_status = get_node_status( ) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') @patch('patroni.ha.Ha.demote') def test_failover_immediately_on_zero_master_start_timeout(self, demote): self.p.is_running = false self.ha.cluster = get_cluster_initialized_with_leader( sync=(self.p.name, 'other')) self.ha.cluster.config.data['synchronous_mode'] = True self.ha.patroni.config.set_dynamic_configuration( {'master_start_timeout': 0}) self.ha.has_lock = true self.ha.update_lock = true self.ha.fetch_node_status = get_node_status( ) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'stopped PostgreSQL to fail over after a crash') demote.assert_called_once() @patch('patroni.postgresql.Postgresql.follow') def test_demote_immediate(self, follow): self.ha.has_lock = true self.e.get_cluster = Mock( return_value=get_cluster_initialized_without_leader()) self.ha.demote('immediate') follow.assert_called_once_with(None) def test_process_sync_replication(self): self.ha.has_lock = true mock_set_sync = self.p.set_synchronous_standby = Mock() self.p.name = 'leader' # Test sync key removed when sync mode disabled self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync: self.ha.run_cycle() mock_delete_sync.assert_called_once() mock_set_sync.assert_called_once_with(None) mock_set_sync.reset_mock() # Test sync key not touched when not there self.ha.cluster = get_cluster_initialized_with_leader() with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync: self.ha.run_cycle() mock_delete_sync.assert_not_called() mock_set_sync.assert_called_once_with(None) mock_set_sync.reset_mock() self.ha.is_synchronous_mode = true # Test sync standby not touched when picking the same node self.p.pick_synchronous_standby = Mock(return_value=('other', True)) self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.ha.run_cycle() mock_set_sync.assert_not_called() mock_set_sync.reset_mock() # Test sync standby is replaced when switching standbys self.p.pick_synchronous_standby = Mock(return_value=('other2', False)) self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.run_cycle() mock_set_sync.assert_called_once_with('other2') mock_set_sync.reset_mock() # Test sync standby is not disabled when updating dcs fails self.ha.dcs.write_sync_state = Mock(return_value=False) self.ha.run_cycle() mock_set_sync.assert_not_called() mock_set_sync.reset_mock() # Test changing sync standby self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.dcs.get_cluster = Mock( return_value=get_cluster_initialized_with_leader(sync=('leader', 'other'))) # self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.p.pick_synchronous_standby = Mock(return_value=('other2', True)) self.ha.run_cycle() self.ha.dcs.get_cluster.assert_called_once() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 2) # Test updating sync standby key failed due to race self.ha.dcs.write_sync_state = Mock(side_effect=[True, False]) self.ha.run_cycle() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 2) # Test changing sync standby failed due to race self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.dcs.get_cluster = Mock( return_value=get_cluster_initialized_with_leader( sync=('somebodyelse', None))) self.ha.run_cycle() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 1) # Test sync set to '*' when synchronous_mode_strict is enabled mock_set_sync.reset_mock() self.ha.is_synchronous_mode_strict = true self.p.pick_synchronous_standby = Mock(return_value=(None, False)) self.ha.run_cycle() mock_set_sync.assert_called_once_with('*') def test_sync_replication_become_master(self): self.ha.is_synchronous_mode = true mock_set_sync = self.p.set_synchronous_standby = Mock() self.p.is_leader = false self.p.set_role('replica') self.ha.has_lock = true mock_write_sync = self.ha.dcs.write_sync_state = Mock( return_value=True) self.p.name = 'leader' self.ha.cluster = get_cluster_initialized_with_leader(sync=('other', None)) # When we just became master nobody is sync self.assertEquals(self.ha.enforce_master_role('msg', 'promote msg'), 'promote msg') mock_set_sync.assert_called_once_with(None) mock_write_sync.assert_called_once_with('leader', None, index=0) mock_set_sync.reset_mock() # When we just became master nobody is sync self.p.set_role('replica') mock_write_sync.return_value = False self.assertTrue( self.ha.enforce_master_role('msg', 'promote msg') != 'promote msg') mock_set_sync.assert_not_called() def test_unhealthy_sync_mode(self): self.ha.is_synchronous_mode = true self.p.is_leader = false self.p.set_role('replica') self.p.name = 'other' self.ha.cluster = get_cluster_initialized_without_leader( sync=('leader', 'other2')) mock_write_sync = self.ha.dcs.write_sync_state = Mock( return_value=True) mock_acquire = self.ha.acquire_lock = Mock(return_value=True) mock_follow = self.p.follow = Mock() mock_promote = self.p.promote = Mock() # If we don't match the sync replica we are not allowed to acquire lock self.ha.run_cycle() mock_acquire.assert_not_called() mock_follow.assert_called_once() self.assertEquals(mock_follow.call_args[0][0], None) mock_write_sync.assert_not_called() mock_follow.reset_mock() # If we do match we will try to promote self.ha._is_healthiest_node = true self.ha.cluster = get_cluster_initialized_without_leader( sync=('leader', 'other')) self.ha.run_cycle() mock_acquire.assert_called_once() mock_follow.assert_not_called() mock_promote.assert_called_once() mock_write_sync.assert_called_once_with('other', None, index=0) def test_disable_sync_when_restarting(self): self.ha.is_synchronous_mode = true self.p.name = 'other' self.p.is_leader = false self.p.set_role('replica') mock_restart = self.p.restart = Mock(return_value=True) self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.ha.touch_member = Mock(return_value=True) self.ha.dcs.get_cluster = Mock(side_effect=[ get_cluster_initialized_with_leader(sync=('leader', syncstandby)) for syncstandby in ['other', None] ]) with patch('time.sleep') as mock_sleep: self.ha.restart({}) mock_restart.assert_called_once() mock_sleep.assert_called() # Restart is still called when DCS connection fails mock_restart.reset_mock() self.ha.dcs.get_cluster = Mock(side_effect=DCSError("foo")) self.ha.restart({}) mock_restart.assert_called_once() # We don't try to fetch the cluster state when touch_member fails mock_restart.reset_mock() self.ha.dcs.get_cluster.reset_mock() self.ha.touch_member = Mock(return_value=False) self.ha.restart({}) mock_restart.assert_called_once() self.ha.dcs.get_cluster.assert_not_called() def test_effective_tags(self): self.ha._disable_sync = True self.assertEquals(self.ha.get_effective_tags(), { 'foo': 'bar', 'nosync': True }) self.ha._disable_sync = False self.assertEquals(self.ha.get_effective_tags(), {'foo': 'bar'}) def test_restore_cluster_config(self): self.ha.cluster.config.data.clear() self.ha.has_lock = true self.ha.cluster.is_unlocked = false self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_watch(self): self.ha.cluster = get_cluster_initialized_with_leader() self.ha.watch(0) def test_wakup(self): self.ha.wakeup() def test_shutdown(self): self.p.is_running = false self.ha.shutdown() @patch('time.sleep', Mock()) def test_leader_with_empty_directory(self): self.ha.cluster = get_cluster_initialized_with_leader() self.ha.has_lock = true self.p.data_directory_empty = true self.assertEquals( self.ha.run_cycle(), 'released leader key voluntarily as data dir empty and currently leader' ) self.assertEquals(self.p.role, 'uninitialized') # as has_lock is mocked out, we need to fake the leader key release self.ha.has_lock = false # will not say bootstrap from leader as replica can't self elect self.assertEquals(self.ha.run_cycle(), "trying to bootstrap from replica 'other'")
class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch.object(etcd.Client, 'read', etcd_read) def setUp(self): with patch.object(etcd.Client, 'machines') as mock_machines: mock_machines.__get__ = Mock(return_value=['http://*****:*****@patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock): self.ha.run_cycle() exit_mock.assert_called_once_with(1) @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = false self.p.is_healthy = true self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_acquire_lock_as_master(self): self.assertEquals(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEquals(self.ha.run_cycle(), 'demoted self after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_leader_with_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') self.ha.patroni.replicatefrom = "foo" self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly')) self.assertEquals(self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') def test_bootstrap_from_another_member(self): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap from replica \'other\'') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_without_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.p.can_create_replica_without_replication_connection = MagicMock(return_value=True) self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap (without leader)') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEquals(self.ha.bootstrap(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.p.bootstrap = Mock(side_effect=PostgresException("Could not bootstrap master PostgreSQL")) self.assertRaises(PostgresException, self.ha.bootstrap) def test_reinitialize(self): self.ha.schedule_reinitialize() self.ha.schedule_reinitialize() self.ha.run_cycle() self.assertIsNone(self.ha._async_executor.scheduled_action) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.has_lock = true self.ha.schedule_reinitialize() self.ha.run_cycle() self.assertIsNone(self.ha._async_executor.scheduled_action) self.ha.has_lock = false self.ha.schedule_reinitialize() self.ha.run_cycle() def test_restart(self): self.assertEquals(self.ha.restart(), (True, 'restarted successfully')) self.p.restart = false self.assertEquals(self.ha.restart(), (False, 'restart failed')) self.ha.schedule_reinitialize() self.assertEquals(self.ha.restart(), (False, 'reinitialize already in progress')) def test_restart_in_progress(self): self.ha._async_executor.schedule('restart', True) self.assertTrue(self.ha.restart_scheduled()) self.assertEquals(self.ha.run_cycle(), 'not healthy enough for leader race') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'failed to update leader lock during restart') @patch('requests.get', requests_get) @patch('time.sleep', Mock()) def test_manual_failover_from_leader(self): self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = lambda e: (e, True, True, 0, {'nofailover': 'True'}) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.ha.run_cycle() scheduled = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'leader', None)) self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = lambda e: (e, True, True, 0, {}) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, self.p.name, '', None)) self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.fetch_node_status = lambda e: (e, False, True, 0, {}) # inaccessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # set failover flag to True for all members of the cluster # this should elect the current member, as we are not going to call the API for it. self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'other', None)) self.ha.fetch_node_status = lambda e: (e, True, True, 0, {'nofailover': 'True'}) # accessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # same as previous, but set the current member to nofailover. In no case it should be elected as a leader self.ha.patroni.nofailover = True self.assertEquals(self.ha.run_cycle(), 'following a different leader because I am not allowed to promote') def test_is_healthiest_node(self): self.ha.state_handler.is_leader = false self.ha.patroni.nofailover = False self.ha.fetch_node_status = lambda e: (e, True, True, 0, {}) self.assertTrue(self.ha.is_healthiest_node()) def test__is_healthiest_node(self): self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = lambda e: (e, True, True, 0, {}) # accessible, in_recovery self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, False, 0, {}) # accessible, not in_recovery self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, True, 1, {}) # accessible, in_recovery, xlog location ahead self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.check_replication_lag = false self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = True self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = False @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://localhost:8011/patroni'}) self.ha.fetch_node_status(member) def test_post_recover(self): self.p.is_running = false self.ha.has_lock = true self.assertEqual(self.ha.post_recover(), 'removed leader key after trying and failing to start postgres') self.ha.has_lock = false self.assertEqual(self.ha.post_recover(), 'failed to start postgres') self.p.is_running = true self.assertIsNone(self.ha.post_recover())
class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch('psycopg2.connect', psycopg2_connect) @patch.object(etcd.Client, 'read', etcd_read) def setUp(self): with patch.object(Client, 'machines') as mock_machines: mock_machines.__get__ = Mock( return_value=['http://*****:*****@patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock): self.ha.run_cycle() exit_mock.assert_called_once_with(1) @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = false self.p.is_healthy = true self.ha.has_lock = true self.assertEquals( self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_acquire_lock_as_master(self): self.assertEquals(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEquals( self.ha.run_cycle(), 'demoted self after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEquals( self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_leader_with_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false self.assertEquals( self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEquals( self.ha.run_cycle(), 'demoted self because failed to update leader lock in DCS') def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEquals( self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') self.ha.patroni.replicatefrom = "foo" self.assertEquals( self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') def test_follow_in_pause(self): self.ha.cluster.is_unlocked = false self.ha.is_paused = true self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'PAUSE: no action') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock( side_effect=DCSError('Etcd is not responding properly')) self.assertEquals( self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') def test_bootstrap_from_another_member(self): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap from replica \'other\'') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_without_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.p.can_create_replica_without_replication_connection = MagicMock( return_value=True) self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap (without leader)') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEquals(self.ha.bootstrap(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.p.bootstrap = Mock(side_effect=PostgresException( "Could not bootstrap master PostgreSQL")) self.assertRaises(PostgresException, self.ha.bootstrap) def test_reinitialize(self): self.assertIsNotNone(self.ha.reinitialize()) self.ha.cluster = get_cluster_initialized_with_leader() self.assertIsNone(self.ha.reinitialize()) self.assertIsNotNone(self.ha.reinitialize()) self.ha.state_handler.name = self.ha.cluster.leader.name self.assertIsNotNone(self.ha.reinitialize()) def test_restart(self): self.assertEquals(self.ha.restart(), (True, 'restarted successfully')) self.p.restart = false self.assertEquals(self.ha.restart(), (False, 'restart failed')) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.reinitialize() self.assertEquals(self.ha.restart(), (False, 'reinitialize already in progress')) with patch.object(self.ha, "restart_matches", return_value=False): self.assertEquals(self.ha.restart({'foo': 'bar'}), (False, "restart conditions are not satisfied")) def test_restart_in_progress(self): with patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=True)): self.ha.restart(run_async=True) self.assertTrue(self.ha.restart_scheduled()) self.assertEquals(self.ha.run_cycle(), 'not healthy enough for leader race') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'failed to update leader lock during restart') @patch('requests.get', requests_get) @patch('time.sleep', Mock()) def test_manual_failover_from_leader(self): self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', '', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, '', 'blabla', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = lambda e: (e, True, True, 0, { 'nofailover': 'True' }) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.ha.run_cycle() scheduled = datetime.datetime.utcnow().replace(tzinfo=pytz.UTC) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_pause(self): self.ha.has_lock = true self.ha.is_paused = true scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, self.p.name, '', None)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) @patch('time.sleep', Mock()) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'leader', None)) self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = lambda e: (e, True, True, 0, {} ) # accessible, in_recovery self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, self.p.name, '', None)) self.assertEquals( self.ha.run_cycle(), 'following a different leader because i am not the healthiest node' ) self.ha.fetch_node_status = lambda e: (e, False, True, 0, {} ) # inaccessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # set failover flag to True for all members of the cluster # this should elect the current member, as we are not going to call the API for it. self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'other', None)) self.ha.fetch_node_status = lambda e: (e, True, True, 0, { 'nofailover': 'True' }) # accessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # same as previous, but set the current member to nofailover. In no case it should be elected as a leader self.ha.patroni.nofailover = True self.assertEquals( self.ha.run_cycle(), 'following a different leader because I am not allowed to promote') @patch('time.sleep', Mock()) def test_manual_failover_process_no_leader_in_pause(self): self.ha.is_paused = true self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, '', 'other', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', '', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', 'blabla', None)) self.assertEquals('PAUSE: acquired session lock as a leader', self.ha.run_cycle()) self.p.is_leader = false self.p.set_role('replica') self.ha.cluster = get_cluster_initialized_without_leader( failover=Failover(0, 'leader', self.p.name, None)) self.assertEquals( self.ha.run_cycle(), 'PAUSE: promoted self to leader by acquiring session lock') def test_is_healthiest_node(self): self.ha.state_handler.is_leader = false self.ha.patroni.nofailover = False self.ha.fetch_node_status = lambda e: (e, True, True, 0, {}) self.assertTrue(self.ha.is_healthiest_node()) self.ha.is_paused = true self.assertFalse(self.ha.is_healthiest_node()) def test__is_healthiest_node(self): self.assertTrue( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = lambda e: (e, True, True, 0, {} ) # accessible, in_recovery self.assertTrue( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, False, 0, {} ) # accessible, not in_recovery self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, True, 1, { }) # accessible, in_recovery, xlog location ahead self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.check_replication_lag = false self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = True self.assertFalse( self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = False @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://localhost:8011/patroni'}) self.ha.fetch_node_status(member) def test_post_recover(self): self.p.is_running = false self.ha.has_lock = true self.assertEqual( self.ha.post_recover(), 'removed leader key after trying and failing to start postgres') self.ha.has_lock = false self.assertEqual(self.ha.post_recover(), 'failed to start postgres') self.p.is_running = true self.assertIsNone(self.ha.post_recover()) def test_schedule_future_restart(self): self.ha.patroni.scheduled_restart = {} # do the restart 2 times. The first one should succeed, the second one should fail self.assertTrue( self.ha.schedule_future_restart({'schedule': future_restart_time})) self.assertFalse( self.ha.schedule_future_restart({'schedule': future_restart_time})) def test_delete_future_restarts(self): self.ha.delete_future_restart() def test_evaluate_scheduled_restart(self): self.p.postmaster_start_time = Mock( return_value=str(postmaster_start_time)) # restart while the postmaster has been already restarted, fails with patch.object( self.ha, 'future_restart_scheduled', Mock( return_value={ 'postmaster_start_time': str(postmaster_start_time - datetime.timedelta(days=1)), 'schedule': str(future_restart_time) })): self.assertIsNone(self.ha.evaluate_scheduled_restart()) with patch.object( self.ha, 'future_restart_scheduled', Mock( return_value={ 'postmaster_start_time': str(postmaster_start_time), 'schedule': str(future_restart_time) })): with patch.object(self.ha, 'should_run_scheduled_action', Mock(return_value=True)): # restart in the future, ok self.assertIsNotNone(self.ha.evaluate_scheduled_restart()) with patch.object(self.ha, 'restart', Mock(return_value=(False, "Test"))): # restart in the future, bit the actual restart failed self.assertIsNone(self.ha.evaluate_scheduled_restart()) def test_scheduled_restart(self): self.ha.cluster = get_cluster_initialized_with_leader() with patch.object(self.ha, "evaluate_scheduled_restart", Mock(return_value="restart scheduled")): self.assertEquals(self.ha.run_cycle(), "restart scheduled") def test_restart_matches(self): self.p._role = 'replica' self.p.server_version = 90500 self.p._pending_restart = True self.assertFalse(self.ha.restart_matches("master", "9.5.0", True)) self.assertFalse(self.ha.restart_matches("replica", "9.4.3", True)) self.p._pending_restart = False self.assertFalse(self.ha.restart_matches("replica", "9.5.2", True)) self.assertTrue(self.ha.restart_matches("replica", "9.5.2", False)) def test_process_healthy_cluster_in_pause(self): self.p.is_leader = false self.ha.is_paused = true self.p.name = 'leader' self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals( self.ha.run_cycle(), 'PAUSE: removed leader lock because postgres is not running as master' ) self.ha.cluster = get_cluster_initialized_with_leader( Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: waiting to become master after promote...') def test_postgres_unhealthy_in_pause(self): self.ha.is_paused = true self.p.is_healthy = false self.assertEquals(self.ha.run_cycle(), 'PAUSE: postgres is not running') self.ha.has_lock = true self.assertEquals( self.ha.run_cycle(), 'PAUSE: removed leader lock because postgres is not running') def test_no_etcd_connection_in_pause(self): self.ha.is_paused = true self.ha.load_cluster_from_dcs = Mock( side_effect=DCSError('Etcd is not responding properly')) self.assertEquals(self.ha.run_cycle(), 'PAUSE: DCS is not accessible')
class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch('psycopg2.connect', psycopg2_connect) @patch('patroni.dcs.dcs_modules', Mock(return_value=['patroni.dcs.foo', 'patroni.dcs.etcd'])) @patch.object(etcd.Client, 'read', etcd_read) def setUp(self): with patch.object(Client, 'machines') as mock_machines: mock_machines.__get__ = Mock(return_value=['http://*****:*****@patch.object(Postgresql, 'fix_cluster_state', Mock()) def test_crash_recovery(self): self.p.is_running = false self.p.controldata = lambda: {'Database cluster state': 'in production'} self.assertEquals(self.ha.run_cycle(), 'doing crash recovery in a single user mode') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_recover_with_rewind(self): self.p.is_running = false self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader') @patch.object(Postgresql, 'can_rewind', PropertyMock(return_value=True)) @patch.object(Postgresql, 'fix_cluster_state', Mock()) def test_single_user_after_recover_failed(self): self.p.controldata = lambda: {'Database cluster state': 'in recovery'} self.p.is_running = false self.p.follow = false self.assertEquals(self.ha.run_cycle(), 'starting as a secondary') self.assertEquals(self.ha.run_cycle(), 'fixing cluster state in a single user mode') @patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock): self.ha.run_cycle() exit_mock.assert_called_once_with(1) @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = false self.p.is_healthy = true self.ha.has_lock = true self.p.controldata = lambda: {'Database cluster state': 'in production'} self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') @patch('psycopg2.connect', psycopg2_connect) def test_acquire_lock_as_master(self): self.assertEquals(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_long_promote(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.p.set_role('master') self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEquals(self.ha.run_cycle(), 'demoted self after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_promote_without_watchdog(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEquals(self.ha.run_cycle(), 'Demoting self because watchdog could not be activated') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'Not promoting self because watchdog could not be activated') def test_leader_with_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false with patch.object(Watchdog, 'is_running', PropertyMock(return_value=True)): self.assertEquals(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'demoted self because failed to update leader lock in DCS') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'not promoting because failed to update leader lock in DCS') def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') self.ha.patroni.replicatefrom = "foo" self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') def test_follow_in_pause(self): self.ha.cluster.is_unlocked = false self.ha.is_paused = true self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'PAUSE: no action') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_follow_triggers_rewind(self): self.p.is_leader = false self.p.trigger_check_diverged_lsn() self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly')) self.assertEquals(self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') @patch('time.sleep', Mock()) def test_bootstrap_from_another_member(self): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap from replica \'other\'') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_without_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.p.can_create_replica_without_replication_connection = MagicMock(return_value=True) self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap (without leader)') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap a new cluster') self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'waiting for end of recovery after bootstrap') self.p.is_leader = true self.assertEquals(self.ha.run_cycle(), 'running post_bootstrap') self.assertEquals(self.ha.run_cycle(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running = false self.assertRaises(PatroniException, self.ha.post_bootstrap) def test_bootstrap_release_initialize_key_on_watchdog_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running.return_value = MockPostmaster() self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEquals(self.ha.post_bootstrap(), 'running post_bootstrap') self.assertRaises(PatroniException, self.ha.post_bootstrap) @patch('psycopg2.connect', psycopg2_connect) def test_reinitialize(self): self.assertIsNotNone(self.ha.reinitialize()) self.ha.cluster = get_cluster_initialized_with_leader() self.assertIsNone(self.ha.reinitialize(True)) self.assertIsNotNone(self.ha.reinitialize()) self.ha.state_handler.name = self.ha.cluster.leader.name self.assertIsNotNone(self.ha.reinitialize()) @patch('time.sleep', Mock()) def test_restart(self): self.assertEquals(self.ha.restart({}), (True, 'restarted successfully')) self.p.restart = Mock(return_value=None) self.assertEquals(self.ha.restart({}), (False, 'postgres is still starting')) self.p.restart = false self.assertEquals(self.ha.restart({}), (False, 'restart failed')) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.reinitialize() self.assertEquals(self.ha.restart({}), (False, 'reinitialize already in progress')) with patch.object(self.ha, "restart_matches", return_value=False): self.assertEquals(self.ha.restart({'foo': 'bar'}), (False, "restart conditions are not satisfied")) @patch('os.kill', Mock()) def test_restart_in_progress(self): with patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=True)): self.ha.restart({}, run_async=True) self.assertTrue(self.ha.restart_scheduled()) self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.p.set_role('master') with patch('patroni.async_executor.CriticalTask.cancel', Mock(return_value=False)): with patch('patroni.postgresql.Postgresql.terminate_starting_postmaster') as mock_terminate: self.assertEquals(self.ha.run_cycle(), 'lost leader lock during restart') mock_terminate.assert_called() @patch('requests.get', requests_get) def test_manual_failover_from_leader(self): self.ha.fetch_node_status = get_node_status() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla', None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.p.rewind_needed_and_possible = true self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = get_node_status(nofailover=True) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(watchdog_failed=True) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(wal_position=1) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.ha.run_cycle() scheduled = datetime.datetime.utcnow().replace(tzinfo=tzutc) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_pause(self): self.ha.has_lock = true self.ha.is_paused = true scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, '', None)) self.assertEquals('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_synchronous_mode(self): self.p.is_leader = true self.ha.has_lock = true self.ha.is_synchronous_mode = true self.ha.is_failover_possible = false self.ha.process_sync_replication = Mock() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, None)) self.assertEquals('no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, 'a')) self.ha.is_failover_possible = true self.assertEquals('manual failover: demoting myself', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'leader', None)) self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, self.p.name, '', None)) self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.fetch_node_status = get_node_status(reachable=False) # inaccessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # set failover flag to True for all members of the cluster # this should elect the current member, as we are not going to call the API for it. self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'other', None)) self.ha.fetch_node_status = get_node_status(nofailover=True) # accessible, in_recovery self.p.set_role('replica') self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # same as previous, but set the current member to nofailover. In no case it should be elected as a leader self.ha.patroni.nofailover = True self.assertEquals(self.ha.run_cycle(), 'following a different leader because I am not allowed to promote') def test_manual_failover_process_no_leader_in_pause(self): self.ha.is_paused = true self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'other', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', '', None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', 'blabla', None)) self.assertEquals('PAUSE: acquired session lock as a leader', self.ha.run_cycle()) self.p.is_leader = false self.p.set_role('replica') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', self.p.name, None)) self.assertEquals(self.ha.run_cycle(), 'PAUSE: promoted self to leader by acquiring session lock') def test_is_healthiest_node(self): self.ha.state_handler.is_leader = false self.ha.patroni.nofailover = False self.ha.fetch_node_status = get_node_status() self.assertTrue(self.ha.is_healthiest_node()) with patch.object(Watchdog, 'is_healthy', PropertyMock(return_value=False)): self.assertFalse(self.ha.is_healthiest_node()) with patch('patroni.postgresql.Postgresql.is_starting', return_value=True): self.assertFalse(self.ha.is_healthiest_node()) self.ha.is_paused = true self.assertFalse(self.ha.is_healthiest_node()) def test__is_healthiest_node(self): self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status(in_recovery=False) # accessible, not in_recovery self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status(wal_position=11) # accessible, in_recovery, wal position ahead self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) with patch('patroni.postgresql.Postgresql.timeline_wal_position', return_value=(1, 1)): self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = True self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = False @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://*****:*****@patch('patroni.ha.Ha.update_lock', return_value=True) @patch('patroni.ha.Ha.demote') def test_starting_timeout(self, demote, update_lock): def check_calls(seq): for mock, called in seq: if called: mock.assert_called_once() else: mock.assert_not_called() mock.reset_mock() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader() self.p.check_for_startup = true self.p.time_in_state = lambda: 30 self.assertEquals(self.ha.run_cycle(), 'PostgreSQL is still starting up, 270 seconds until timeout') check_calls([(update_lock, True), (demote, False)]) self.p.time_in_state = lambda: 350 self.ha.fetch_node_status = get_node_status(reachable=False) # inaccessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'master start has timed out, but continuing to wait because failover is not possible') check_calls([(update_lock, True), (demote, False)]) self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'stopped PostgreSQL because of startup timeout') check_calls([(update_lock, True), (demote, True)]) update_lock.return_value = False self.assertEquals(self.ha.run_cycle(), 'stopped PostgreSQL while starting up because leader key was lost') check_calls([(update_lock, True), (demote, True)]) self.ha.has_lock = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') check_calls([(update_lock, False), (demote, False)]) def test_manual_failover_while_starting(self): self.ha.has_lock = true self.p.check_for_startup = true f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') @patch('patroni.ha.Ha.demote') def test_failover_immediately_on_zero_master_start_timeout(self, demote): self.p.is_running = false self.ha.cluster = get_cluster_initialized_with_leader(sync=(self.p.name, 'other')) self.ha.cluster.config.data['synchronous_mode'] = True self.ha.patroni.config.set_dynamic_configuration({'master_start_timeout': 0}) self.ha.has_lock = true self.ha.update_lock = true self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'stopped PostgreSQL to fail over after a crash') demote.assert_called_once() @patch('patroni.postgresql.Postgresql.follow') def test_demote_immediate(self, follow): self.ha.has_lock = true self.e.get_cluster = Mock(return_value=get_cluster_initialized_without_leader()) self.ha.demote('immediate') follow.assert_called_once_with(None) def test_process_sync_replication(self): self.ha.has_lock = true mock_set_sync = self.p.set_synchronous_standby = Mock() self.p.name = 'leader' # Test sync key removed when sync mode disabled self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync: self.ha.run_cycle() mock_delete_sync.assert_called_once() mock_set_sync.assert_called_once_with(None) mock_set_sync.reset_mock() # Test sync key not touched when not there self.ha.cluster = get_cluster_initialized_with_leader() with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync: self.ha.run_cycle() mock_delete_sync.assert_not_called() mock_set_sync.assert_called_once_with(None) mock_set_sync.reset_mock() self.ha.is_synchronous_mode = true # Test sync standby not touched when picking the same node self.p.pick_synchronous_standby = Mock(return_value=('other', True)) self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.ha.run_cycle() mock_set_sync.assert_not_called() mock_set_sync.reset_mock() # Test sync standby is replaced when switching standbys self.p.pick_synchronous_standby = Mock(return_value=('other2', False)) self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.run_cycle() mock_set_sync.assert_called_once_with('other2') mock_set_sync.reset_mock() # Test sync standby is not disabled when updating dcs fails self.ha.dcs.write_sync_state = Mock(return_value=False) self.ha.run_cycle() mock_set_sync.assert_not_called() mock_set_sync.reset_mock() # Test changing sync standby self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.dcs.get_cluster = Mock(return_value=get_cluster_initialized_with_leader(sync=('leader', 'other'))) # self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.p.pick_synchronous_standby = Mock(return_value=('other2', True)) self.ha.run_cycle() self.ha.dcs.get_cluster.assert_called_once() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 2) # Test updating sync standby key failed due to race self.ha.dcs.write_sync_state = Mock(side_effect=[True, False]) self.ha.run_cycle() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 2) # Test changing sync standby failed due to race self.ha.dcs.write_sync_state = Mock(return_value=True) self.ha.dcs.get_cluster = Mock(return_value=get_cluster_initialized_with_leader(sync=('somebodyelse', None))) self.ha.run_cycle() self.assertEquals(self.ha.dcs.write_sync_state.call_count, 1) # Test sync set to '*' when synchronous_mode_strict is enabled mock_set_sync.reset_mock() self.ha.is_synchronous_mode_strict = true self.p.pick_synchronous_standby = Mock(return_value=(None, False)) self.ha.run_cycle() mock_set_sync.assert_called_once_with('*') def test_sync_replication_become_master(self): self.ha.is_synchronous_mode = true mock_set_sync = self.p.set_synchronous_standby = Mock() self.p.is_leader = false self.p.set_role('replica') self.ha.has_lock = true mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=True) self.p.name = 'leader' self.ha.cluster = get_cluster_initialized_with_leader(sync=('other', None)) # When we just became master nobody is sync self.assertEquals(self.ha.enforce_master_role('msg', 'promote msg'), 'promote msg') mock_set_sync.assert_called_once_with(None) mock_write_sync.assert_called_once_with('leader', None, index=0) mock_set_sync.reset_mock() # When we just became master nobody is sync self.p.set_role('replica') mock_write_sync.return_value = False self.assertTrue(self.ha.enforce_master_role('msg', 'promote msg') != 'promote msg') mock_set_sync.assert_not_called() def test_unhealthy_sync_mode(self): self.ha.is_synchronous_mode = true self.p.is_leader = false self.p.set_role('replica') self.p.name = 'other' self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'other2')) mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=True) mock_acquire = self.ha.acquire_lock = Mock(return_value=True) mock_follow = self.p.follow = Mock() mock_promote = self.p.promote = Mock() # If we don't match the sync replica we are not allowed to acquire lock self.ha.run_cycle() mock_acquire.assert_not_called() mock_follow.assert_called_once() self.assertEquals(mock_follow.call_args[0][0], None) mock_write_sync.assert_not_called() mock_follow.reset_mock() # If we do match we will try to promote self.ha._is_healthiest_node = true self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'other')) self.ha.run_cycle() mock_acquire.assert_called_once() mock_follow.assert_not_called() mock_promote.assert_called_once() mock_write_sync.assert_called_once_with('other', None, index=0) def test_disable_sync_when_restarting(self): self.ha.is_synchronous_mode = true self.p.name = 'other' self.p.is_leader = false self.p.set_role('replica') mock_restart = self.p.restart = Mock(return_value=True) self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other')) self.ha.touch_member = Mock(return_value=True) self.ha.dcs.get_cluster = Mock(side_effect=[ get_cluster_initialized_with_leader(sync=('leader', syncstandby)) for syncstandby in ['other', None]]) with patch('time.sleep') as mock_sleep: self.ha.restart({}) mock_restart.assert_called_once() mock_sleep.assert_called() # Restart is still called when DCS connection fails mock_restart.reset_mock() self.ha.dcs.get_cluster = Mock(side_effect=DCSError("foo")) self.ha.restart({}) mock_restart.assert_called_once() # We don't try to fetch the cluster state when touch_member fails mock_restart.reset_mock() self.ha.dcs.get_cluster.reset_mock() self.ha.touch_member = Mock(return_value=False) self.ha.restart({}) mock_restart.assert_called_once() self.ha.dcs.get_cluster.assert_not_called() def test_effective_tags(self): self.ha._disable_sync = True self.assertEquals(self.ha.get_effective_tags(), {'foo': 'bar', 'nosync': True}) self.ha._disable_sync = False self.assertEquals(self.ha.get_effective_tags(), {'foo': 'bar'}) def test_restore_cluster_config(self): self.ha.cluster.config.data.clear() self.ha.has_lock = true self.ha.cluster.is_unlocked = false self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_watch(self): self.ha.cluster = get_cluster_initialized_with_leader() self.ha.watch(0) def test_wakup(self): self.ha.wakeup() def test_shutdown(self): self.p.is_running = false self.ha.has_lock = true self.ha.shutdown() @patch('time.sleep', Mock()) def test_leader_with_empty_directory(self): self.ha.cluster = get_cluster_initialized_with_leader() self.ha.has_lock = true self.p.data_directory_empty = true self.assertEquals(self.ha.run_cycle(), 'released leader key voluntarily as data dir empty and currently leader') self.assertEquals(self.p.role, 'uninitialized') # as has_lock is mocked out, we need to fake the leader key release self.ha.has_lock = false # will not say bootstrap from leader as replica can't self elect self.assertEquals(self.ha.run_cycle(), "trying to bootstrap from replica 'other'") def test_update_cluster_history(self): self.p.get_master_timeline = Mock(return_value=1) self.ha.has_lock = true self.ha.cluster.is_unlocked = false self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock')
class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch.object(Client, 'machines') def setUp(self, mock_machines): mock_machines.__get__ = Mock(return_value=['http://*****:*****@patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = self.p.is_healthy = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_acquire_lock_as_master(self): self.assertEquals(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEquals(self.ha.run_cycle(), 'demoted self due after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_leader_with_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_follow_the_leader(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEquals(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly')) self.assertEquals(self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') def test_bootstrap_from_leader(self): self.ha.cluster = get_cluster_initialized_with_leader() self.p.bootstrap = false self.assertEquals(self.ha.bootstrap(), 'trying to bootstrap from leader') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEquals(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEquals(self.ha.bootstrap(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.p.bootstrap = Mock(side_effect=PostgresException("Could not bootstrap master PostgreSQL")) self.assertRaises(PostgresException, self.ha.bootstrap) def test_reinitialize(self): self.ha.schedule_reinitialize() self.ha.schedule_reinitialize() self.ha.run_cycle() self.assertIsNone(self.ha._async_executor.scheduled_action) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.has_lock = true self.ha.schedule_reinitialize() self.ha.run_cycle() self.assertIsNone(self.ha._async_executor.scheduled_action) self.ha.has_lock = false self.ha.schedule_reinitialize() self.ha.run_cycle() def test_restart(self): self.assertEquals(self.ha.restart(), (True, 'restarted successfully')) self.p.restart = false self.assertEquals(self.ha.restart(), (False, 'restart failed')) self.ha.schedule_reinitialize() self.assertEquals(self.ha.restart(), (False, 'reinitialize already in progress')) def test_restart_in_progress(self): self.ha._async_executor.schedule('restart', True) self.assertTrue(self.ha.restart_scheduled()) self.assertEquals(self.ha.run_cycle(), 'not healthy enough for leader race') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEquals(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.assertEquals(self.ha.run_cycle(), 'failed to update leader lock during restart') @patch('requests.get', requests_get) def test_manual_failover_from_leader(self): self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '')) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', MockPostgresql.name)) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla')) self.assertEquals(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, MockPostgresql.name, '') self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEquals(self.ha.run_cycle(), 'manual failover: demoting myself') @patch('requests.get', requests_get) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', MockPostgresql.name)) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'leader')) self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = lambda e: (e, True, True, 0) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, MockPostgresql.name, '')) self.assertEquals(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.fetch_node_status = lambda e: (e, False, True, 0) # accessible, in_recovery self.assertEquals(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test__is_healthiest_node(self): self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = lambda e: (e, True, True, 0) # accessible, in_recovery self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, False, 0) # accessible, not in_recovery self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = lambda e: (e, True, True, 1) # accessible, in_recovery, xlog location ahead self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.check_replication_lag = false self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://localhost:8011/patroni'}) self.ha.fetch_node_status(member)