def test_do_POST_failover(self, dcs):
    cluster = dcs.get_cluster.return_value
    post = 'POST /failover HTTP/1.0' + self._authorization + '\nContent-Length: '
    MockRestApiServer(RestApiHandler, post + '7\n\n{"1":2}')
    request = post + '0\n\n'
    MockRestApiServer(RestApiHandler, request)
    cluster.leader.name = 'postgresql1'
    MockRestApiServer(RestApiHandler, request)
    MockRestApiServer(RestApiHandler, post + '25\n\n{"leader": "postgresql1"}')
    cluster.leader.name = 'postgresql2'
    request = post + '53\n\n{"leader": "postgresql1", "candidate": "postgresql2"}'
    MockRestApiServer(RestApiHandler, request)
    cluster.leader.name = 'postgresql1'
    MockRestApiServer(RestApiHandler, request)
    cluster.members = [Member(0, 'postgresql0', 30, {'api_url': 'http'}),
                       Member(0, 'postgresql2', 30, {'api_url': 'http'})]
    MockRestApiServer(RestApiHandler, request)
    with patch.object(MockPatroni, 'dcs') as d:
        cluster = d.get_cluster.return_value
        cluster.leader.name = 'postgresql0'
        MockRestApiServer(RestApiHandler, request)
        cluster.leader.name = 'postgresql2'
        MockRestApiServer(RestApiHandler, request)
        cluster.leader.name = 'postgresql1'
        cluster.failover = None
        MockRestApiServer(RestApiHandler, request)
        d.get_cluster = Mock(side_effect=Exception)
        MockRestApiServer(RestApiHandler, request)
        d.manual_failover.return_value = False
        MockRestApiServer(RestApiHandler, request)
        with patch.object(MockHa, 'fetch_nodes_statuses', Mock(return_value=[])):
            MockRestApiServer(RestApiHandler, request)
    # Valid future date
    request = post + '103\n\n{"leader": "postgresql1", "member": "postgresql2",' + \
        ' "scheduled_at": "6016-02-15T18:13:30.568224+01:00"}'
    MockRestApiServer(RestApiHandler, request)
    with patch.object(MockPatroni, 'dcs') as d:
        d.manual_failover.return_value = False
        MockRestApiServer(RestApiHandler, request)
    # Exception: No timezone specified
    request = post + '97\n\n{"leader": "postgresql1", "member": "postgresql2",' + \
        ' "scheduled_at": "6016-02-15T18:13:30.568224"}'
    MockRestApiServer(RestApiHandler, request)
    # Exception: Scheduled in the past
    request = post + '103\n\n{"leader": "postgresql1", "member": "postgresql2", "scheduled_at": "'
    MockRestApiServer(RestApiHandler, request + '1016-02-15T18:13:30.568224+01:00"}')
    # Invalid date
    self.assertIsNotNone(MockRestApiServer(RestApiHandler, request + '2010-02-29T18:13:30.568224+01:00"}'))
def test_sync_replication_slots(self):
    self.p.start()
    cluster = Cluster(True, None, self.leader, 0, [self.me, self.other, self.leadermem], None, None)
    with mock.patch('patroni.postgresql.Postgresql._query', Mock(side_effect=psycopg2.OperationalError)):
        self.p.sync_replication_slots(cluster)
    self.p.sync_replication_slots(cluster)
    with mock.patch('patroni.postgresql.Postgresql.role', new_callable=PropertyMock(return_value='replica')):
        self.p.sync_replication_slots(cluster)
    with mock.patch('patroni.postgresql.logger.error', new_callable=Mock()) as errorlog_mock:
        self.p.query = Mock()
        alias1 = Member(0, 'test-3', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres'})
        alias2 = Member(0, 'test.3', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres'})
        cluster.members.extend([alias1, alias2])
        self.p.sync_replication_slots(cluster)
        errorlog_mock.assert_called_once()
        assert "test-3" in errorlog_mock.call_args[0][1]
        assert "test.3" in errorlog_mock.call_args[0][1]
def get_cluster_initialized_without_leader(leader=False, failover=None):
    m1 = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres',
                                  'api_url': 'http://127.0.0.1:8008/patroni', 'xlog_location': 4})
    l = Leader(0, 0, m1) if leader else None
    m2 = Member(0, 'other', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres',
                                 'api_url': 'http://127.0.0.1:8011/patroni', 'tags': {'clonefrom': True}})
    return get_cluster(True, l, [m1, m2], failover)
def get_cluster_initialized_without_leader(leader=False, failover=None, sync=None):
    m1 = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres',
                                  'api_url': 'http://127.0.0.1:8008/patroni', 'xlog_location': 4})
    leader = Leader(0, 0, m1) if leader else None
    m2 = Member(0, 'other', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres',
                                 'api_url': 'http://127.0.0.1:8011/patroni',
                                 'state': 'running',
                                 'tags': {'clonefrom': True},
                                 'scheduled_restart': {'schedule': "2100-01-01 10:53:07.560445+00:00",
                                                       'postgres_version': '99.0.0'}})
    syncstate = SyncState(0 if sync else None, sync and sync[0], sync and sync[1])
    return get_cluster(True, leader, [m1, m2], failover, syncstate)
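# For orientation, a hedged sketch of how the fixture above is typically driven.
# These calls are illustrative only; the argument shapes are taken from tests
# elsewhere in this section (e.g. Failover(0, 'leader', 'blabla', None) and
# sync=('leader', 'other')), not from the fixture definition itself.

# no leader key at all
cluster = get_cluster_initialized_without_leader()

# leader key present plus a manual failover request from 'leader' to 'other'
cluster = get_cluster_initialized_without_leader(leader=True, failover=Failover(0, 'leader', 'other', None))

# synchronous-replication state: 'leader' is the sync leader, 'other' the sync standby
cluster = get_cluster_initialized_without_leader(sync=('leader', 'other'))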
def test_sync_replication_slots(self):
    config = ClusterConfig(1, {'slots': {'test_3': {'database': 'a', 'plugin': 'b'},
                                         'A': 0, 'ls': 0, 'b': {'type': 'logical', 'plugin': '1'}},
                               'ignore_slots': [{'name': 'blabla'}]}, 1)
    cluster = Cluster(True, config, self.leader, 0, [self.me, self.other, self.leadermem],
                      None, None, None, {'test_3': 10})
    with mock.patch('patroni.postgresql.Postgresql._query', Mock(side_effect=psycopg.OperationalError)):
        self.s.sync_replication_slots(cluster, False)
    self.p.set_role('standby_leader')
    self.s.sync_replication_slots(cluster, False)
    self.p.set_role('replica')
    with patch.object(Postgresql, 'is_leader', Mock(return_value=False)):
        self.s.sync_replication_slots(cluster, False)
    self.p.set_role('master')
    with mock.patch('patroni.postgresql.Postgresql.role', new_callable=PropertyMock(return_value='replica')):
        self.s.sync_replication_slots(cluster, False)
    with patch.object(SlotsHandler, 'drop_replication_slot', Mock(return_value=True)), \
            patch('patroni.dcs.logger.error', new_callable=Mock()) as errorlog_mock:
        alias1 = Member(0, 'test-3', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres'})
        alias2 = Member(0, 'test.3', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5436/postgres'})
        cluster.members.extend([alias1, alias2])
        self.s.sync_replication_slots(cluster, False)
        self.assertEqual(errorlog_mock.call_count, 5)
        ca = errorlog_mock.call_args_list[0][0][1]
        self.assertTrue("test-3" in ca, "non matching {0}".format(ca))
        self.assertTrue("test.3" in ca, "non matching {0}".format(ca))
    with patch.object(Postgresql, 'major_version', PropertyMock(return_value=90618)):
        self.s.sync_replication_slots(cluster, False)
def test_fetch_node_status(self):
    member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'})
    self.ha.fetch_node_status(member)
    member = Member(0, 'test', 1, {'api_url': 'http://localhost:8011/patroni'})
    self.ha.fetch_node_status(member)
def setUp(self):
    self.data_dir = 'data/test0'
    self.config_dir = self.data_dir
    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)
    self.p = Postgresql({'name': 'test0', 'scope': 'batman', 'data_dir': self.data_dir,
                         'config_dir': self.config_dir, 'retry_timeout': 10, 'pgpass': '******',
                         'listen': '127.0.0.2, 127.0.0.3:5432', 'connect_address': '127.0.0.2:5432',
                         'authentication': {'superuser': {'username': '******', 'password': '******'},
                                            'replication': {'username': '******', 'password': '******'}},
                         'remove_data_directory_on_rewind_failure': True,
                         'use_pg_rewind': True, 'pg_ctl_timeout': 'bla',
                         'parameters': self._PARAMETERS,
                         'recovery_conf': {'foo': 'bar'},
                         'pg_hba': ['host all all 0.0.0.0/0 md5'],
                         'callbacks': {'on_start': 'true', 'on_stop': 'true', 'on_reload': 'true',
                                       'on_restart': 'true', 'on_role_change': 'true'}})
    self.p._callback_executor = Mock()
    self.leadermem = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres'})
    self.leader = Leader(-1, 28, self.leadermem)
    self.other = Member(0, 'test-1', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5433/postgres',
                                          'tags': {'replicatefrom': 'leader'}})
    self.me = Member(0, 'test0', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5434/postgres'})
def setUp(self):
    super(BaseTestPostgresql, self).setUp()
    if not os.path.exists(self.p.data_dir):
        os.makedirs(self.p.data_dir)
    self.leadermem = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres'})
    self.leader = Leader(-1, 28, self.leadermem)
    self.other = Member(0, 'test-1', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5433/postgres',
                                          'tags': {'replicatefrom': 'leader'}})
    self.me = Member(0, 'test0', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5434/postgres'})
def _load_cluster(self):
    try:
        path = self.client_path('/')
        _, results = self._client.kv.get(path, recurse=True)

        if results is None:
            raise NotFound

        nodes = {}
        for node in results:
            node['Value'] = (node['Value'] or b'').decode('utf-8')
            nodes[os.path.relpath(node['Key'], path)] = node

        # get initialize flag
        initialize = nodes.get(self._INITIALIZE)
        initialize = initialize and initialize['Value']

        # get last leader operation
        last_leader_operation = nodes.get(self._LEADER_OPTIME)
        last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation['Value'])

        # get list of members
        members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

        # get leader
        leader = nodes.get(self._LEADER)
        if leader and leader['Value'] == self._name and self._session != leader.get('Session', 'x'):
            logger.info('I am leader but not owner of the session. Removing leader node')
            self._client.kv.delete(self.leader_path, cas=leader['ModifyIndex'])
            leader = None

        if leader:
            member = Member(-1, leader['Value'], None, {})
            member = ([m for m in members if m.name == leader['Value']] or [member])[0]
            leader = Leader(leader['ModifyIndex'], leader.get('Session'), member)

        # failover key
        failover = nodes.get(self._FAILOVER)
        if failover:
            failover = Failover.from_node(failover['ModifyIndex'], failover['Value'])

        self._cluster = Cluster(initialize, leader, last_leader_operation, members, failover)
    except NotFound:
        self._cluster = Cluster(False, None, None, [], None)
    except:
        logger.exception('get_cluster')
        raise ConsulError('Consul is not responding properly')
def member(pod):
    annotations = pod.metadata.annotations or {}
    member = Member.from_node(pod.metadata.resource_version, pod.metadata.name, None, annotations.get('status', ''))
    member.data['pod_labels'] = pod.metadata.labels
    return member
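# A minimal, self-contained sketch of what the adapter above consumes and produces.
# The pod stub is hypothetical (real tests use the Kubernetes client models), and it
# assumes Member.from_node() parses a JSON 'status' annotation into member.data.
from types import SimpleNamespace

from patroni.dcs import Member


def k8s_member(pod):
    # same logic as the adapter above, reproduced so the example runs standalone
    annotations = pod.metadata.annotations or {}
    m = Member.from_node(pod.metadata.resource_version, pod.metadata.name, None, annotations.get('status', ''))
    m.data['pod_labels'] = pod.metadata.labels
    return m


pod = SimpleNamespace(metadata=SimpleNamespace(
    resource_version='1', name='test-0', labels={'role': 'replica'},
    annotations={'status': '{"conn_url": "postgres://127.0.0.1:5434/postgres"}'}))

m = k8s_member(pod)
assert m.name == 'test-0' and m.data['pod_labels'] == {'role': 'replica'}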
def test_do_POST_failover(self, dcs):
    cluster = dcs.get_cluster.return_value
    request = b'POST /failover HTTP/1.0\nAuthorization: Basic dGVzdDp0ZXN0\n' + \
              b'Content-Length: 25\n\n{"leader": "postgresql1"}'
    MockRestApiServer(RestApiHandler, request)
    cluster.leader.name = 'postgresql1'
    MockRestApiServer(RestApiHandler, request)
    cluster.members = [Member(0, 'postgresql0', 30, {'api_url': 'http'})]
    MockRestApiServer(RestApiHandler, request)
    with patch.object(MockPatroni, 'dcs') as d:
        cluster = d.get_cluster.return_value
        cluster.leader.name = 'postgresql0'
        MockRestApiServer(RestApiHandler, request)
        cluster.leader.name = 'postgresql1'
        cluster.failover = None
        MockRestApiServer(RestApiHandler, request)
        d.get_cluster = Mock(side_effect=Exception())
        MockRestApiServer(RestApiHandler, request)
        d.manual_failover.return_value = False
        MockRestApiServer(RestApiHandler, request)
        with patch.object(MockHa, 'fetch_nodes_statuses', Mock(return_value=[])):
            MockRestApiServer(RestApiHandler, request)
    request = b'POST /failover HTTP/1.0\nAuthorization: Basic dGVzdDp0ZXN0\n' + \
              b'Content-Length: 50\n\n{"leader": "postgresql1", "member": "postgresql2"}'
    MockRestApiServer(RestApiHandler, request)
def _load_cluster(self): try: path = self.client_path('/') _, results = self.retry(self._client.kv.get, path, recurse=True) if results is None: raise NotFound nodes = {} for node in results: node['Value'] = (node['Value'] or b'').decode('utf-8') nodes[node['Key'][len(path):].lstrip('/')] = node # get initialize flag initialize = nodes.get(self._INITIALIZE) initialize = initialize and initialize['Value'] # get global dynamic configuration config = nodes.get(self._CONFIG) config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value']) # get timeline history history = nodes.get(self._HISTORY) history = history and TimelineHistory.from_node(history['ModifyIndex'], history['Value']) # get last leader operation last_leader_operation = nodes.get(self._LEADER_OPTIME) last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation['Value']) # get list of members members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] # get leader leader = nodes.get(self._LEADER) if not self._ctl and leader and leader['Value'] == self._name \ and self._session != leader.get('Session', 'x'): logger.info('I am leader but not owner of the session. Removing leader node') self._client.kv.delete(self.leader_path, cas=leader['ModifyIndex']) leader = None if leader: member = Member(-1, leader['Value'], None, {}) member = ([m for m in members if m.name == leader['Value']] or [member])[0] leader = Leader(leader['ModifyIndex'], leader.get('Session'), member) # failover key failover = nodes.get(self._FAILOVER) if failover: failover = Failover.from_node(failover['ModifyIndex'], failover['Value']) # get synchronization state sync = nodes.get(self._SYNC) sync = SyncState.from_node(sync and sync['ModifyIndex'], sync and sync['Value']) self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history) except NotFound: self._cluster = Cluster(None, None, None, None, [], None, None, None) except Exception: logger.exception('get_cluster') raise ConsulError('Consul is not responding properly')
def _load_cluster(self):
    try:
        result = self.retry(self._client.read, self.client_path(''), recursive=True)
        nodes = {os.path.relpath(node.key, result.key): node for node in result.leaves}

        # get initialize flag
        initialize = nodes.get(self._INITIALIZE)
        initialize = initialize and initialize.value

        # get last leader operation
        last_leader_operation = nodes.get(self._LEADER_OPTIME)
        last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation.value)

        # get list of members
        members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1]

        # get leader
        leader = nodes.get(self._LEADER)
        if leader:
            member = Member(-1, leader.value, None, {})
            member = ([m for m in members if m.name == leader.value] or [member])[0]
            leader = Leader(leader.modifiedIndex, leader.ttl, member)

        # failover key
        failover = nodes.get(self._FAILOVER)
        if failover:
            failover = Failover.from_node(failover.modifiedIndex, failover.value)

        self._cluster = Cluster(initialize, leader, last_leader_operation, members, failover)
    except etcd.EtcdKeyNotFound:
        self._cluster = Cluster(False, None, None, [], None)
    except:
        logger.exception('get_cluster')
        raise EtcdError('Etcd is not responding properly')
def _load_cluster(self): try: result = self.retry(self._client.read, self.client_path(''), recursive=True) nodes = { os.path.relpath(node.key, result.key): node for node in result.leaves } # get initialize flag initialize = nodes.get(self._INITIALIZE) initialize = initialize and initialize.value # get global dynamic configuration config = nodes.get(self._CONFIG) config = config and ClusterConfig.from_node( config.modifiedIndex, config.value) # get last leader operation last_leader_operation = nodes.get(self._LEADER_OPTIME) last_leader_operation = 0 if last_leader_operation is None else int( last_leader_operation.value) # get list of members members = [ self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1 ] # get leader leader = nodes.get(self._LEADER) if leader: member = Member(-1, leader.value, None, {}) member = ([m for m in members if m.name == leader.value] or [member])[0] index = result.etcd_index if result.etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1 leader = Leader(index, leader.ttl, member) # failover key failover = nodes.get(self._FAILOVER) if failover: failover = Failover.from_node(failover.modifiedIndex, failover.value) # get synchronization state sync = nodes.get(self._SYNC) sync = SyncState.from_node(sync and sync.modifiedIndex, sync and sync.value) self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync) except etcd.EtcdKeyNotFound: self._cluster = Cluster(None, None, None, None, [], None, None) except: logger.exception('get_cluster') raise EtcdError('Etcd is not responding properly')
def _inner_load_cluster(self): self._fetch_cluster = False self.event.clear() nodes = set( self.get_children(self.client_path(''), self.cluster_watcher)) if not nodes: self._fetch_cluster = True # get initialize flag initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None # get global dynamic configuration config = self.get_node( self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid) # get last leader operation last_leader_operation = self._OPTIME in nodes and self._fetch_cluster and self.get_node( self.leader_optime_path) last_leader_operation = last_leader_operation and int( last_leader_operation[0]) or 0 # get list of members members = self.load_members() if self._MEMBERS[:-1] in nodes else [] # get leader leader = self.get_node( self.leader_path) if self._LEADER in nodes else None if leader: client_id = self._client.client_id if not self._ctl and leader[0] == self._name and client_id is not None \ and client_id[0] != leader[1].ephemeralOwner: logger.info( 'I am leader but not owner of the session. Removing leader node' ) self._client.delete(self.leader_path) leader = None if leader: member = Member(-1, leader[0], None, {}) member = ([m for m in members if m.name == leader[0]] or [member])[0] leader = Leader(leader[1].version, leader[1].ephemeralOwner, member) self._fetch_cluster = member.index == -1 # failover key failover = self.get_node( self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None failover = failover and Failover.from_node(failover[1].version, failover[0]) self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover)
def test_check_access(self, mock_dcs):
    mock_dcs.cluster = get_cluster_initialized_without_leader()
    mock_dcs.cluster.members[1].data['api_url'] = 'http://127.0.0.1z:8011/patroni'
    mock_dcs.cluster.members.append(Member(0, 'bad-api-url', 30, {'api_url': 123}))
    mock_rh = Mock()
    mock_rh.client_address = ('127.0.0.2',)
    self.assertIsNot(self.srv.check_access(mock_rh), True)
    mock_rh.client_address = ('127.0.0.1',)
    mock_rh.request.getpeercert.return_value = None
    self.assertIsNot(self.srv.check_access(mock_rh), True)
def _load_cluster(self): prefix = self.client_path('') response = self._sync_obj.get(prefix, recursive=True) if not response: return Cluster(None, None, None, None, [], None, None, None) nodes = { os.path.relpath(key, prefix).replace('\\', '/'): value for key, value in response.items() } # get initialize flag initialize = nodes.get(self._INITIALIZE) initialize = initialize and initialize['value'] # get global dynamic configuration config = nodes.get(self._CONFIG) config = config and ClusterConfig.from_node(config['index'], config['value']) # get timeline history history = nodes.get(self._HISTORY) history = history and TimelineHistory.from_node( history['index'], history['value']) # get last leader operation last_leader_operation = nodes.get(self._LEADER_OPTIME) last_leader_operation = 0 if last_leader_operation is None else int( last_leader_operation['value']) # get list of members members = [ self.member(k, n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1 ] # get leader leader = nodes.get(self._LEADER) if leader: member = Member(-1, leader['value'], None, {}) member = ([m for m in members if m.name == leader['value']] or [member])[0] leader = Leader(leader['index'], None, member) # failover key failover = nodes.get(self._FAILOVER) if failover: failover = Failover.from_node(failover['index'], failover['value']) # get synchronization state sync = nodes.get(self._SYNC) sync = SyncState.from_node(sync and sync['index'], sync and sync['value']) return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)
def setUp(self):
    self.data_dir = 'data/test0'
    if not os.path.exists(self.data_dir):
        os.makedirs(self.data_dir)
    self.p = Postgresql({'name': 'test0', 'scope': 'batman', 'data_dir': self.data_dir, 'retry_timeout': 10,
                         'listen': '127.0.0.1, *:5432', 'connect_address': '127.0.0.2:5432',
                         'authentication': {'superuser': {'username': '******', 'password': '******'},
                                            'replication': {'username': '******', 'password': '******'}},
                         'use_pg_rewind': True, 'parameters': self._PARAMETERS,
                         'recovery_conf': {'foo': 'bar'},
                         'callbacks': {'on_start': 'true', 'on_stop': 'true', 'on_restart': 'true',
                                       'on_role_change': 'true', 'on_reload': 'true'},
                         'restore': 'true'})
    self.leadermem = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres'})
    self.leader = Leader(-1, 28, self.leadermem)
    self.other = Member(0, 'test1', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5433/postgres',
                                         'tags': {'replicatefrom': 'leader'}})
    self.me = Member(0, 'test0', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5434/postgres'})
def setUp(self):
    self.p = Postgresql({'name': 'test0', 'scope': 'batman', 'data_dir': 'data/test0',
                         'listen': '127.0.0.1, *:5432', 'connect_address': '127.0.0.2:5432',
                         'pg_hba': ['hostssl all all 0.0.0.0/0 md5', 'host all all 0.0.0.0/0 md5'],
                         'superuser': {'password': '******'},
                         'admin': {'username': '******', 'password': '******'},
                         'pg_rewind': {'username': '******', 'password': '******'},
                         'replication': {'username': '******', 'password': '******', 'network': '127.0.0.1/32'},
                         'parameters': {'foo': 'bar'}, 'recovery_conf': {'foo': 'bar'},
                         'callbacks': {'on_start': 'true', 'on_stop': 'true', 'on_restart': 'true',
                                       'on_role_change': 'true', 'on_reload': 'true'},
                         'restore': 'true'})
    if not os.path.exists(self.p.data_dir):
        os.makedirs(self.p.data_dir)
    self.leadermem = Member(0, 'leader', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5435/postgres'})
    self.leader = Leader(-1, 28, self.leadermem)
    self.other = Member(0, 'test1', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5433/postgres'})
    self.me = Member(0, 'test0', 28, {'conn_url': 'postgres://*****:*****@127.0.0.1:5434/postgres'})
def _inner_load_cluster(self):
    self._fetch_cluster = False
    self.event.clear()
    nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
    if not nodes:
        self._fetch_cluster = True

    # get initialize flag
    initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

    # get list of members
    members = self.load_members() if self._MEMBERS[:-1] in nodes else []

    # get leader
    leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
    if leader:
        client_id = self._client.client_id
        if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
            logger.info('I am leader but not owner of the session. Removing leader node')
            self._client.delete(self.leader_path)
            leader = None

        if leader:
            member = Member(-1, leader[0], None, {})
            member = ([m for m in members if m.name == leader[0]] or [member])[0]
            leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
            self._fetch_cluster = member.index == -1

    # failover key
    failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
    if failover:
        failover = Failover.from_node(failover[1].version, failover[0])

    # get last leader operation
    optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
    self._last_leader_operation = 0 if optime is None else int(optime[0])
    self._cluster = Cluster(initialize, leader, self._last_leader_operation, members, failover)
def _inner_load_cluster(self): self._fetch_cluster = False self.event.clear() nodes = set(self.get_children(self.client_path(''), self.cluster_watcher)) if not nodes: self._fetch_cluster = True # get initialize flag initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None # get global dynamic configuration config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid) # get timeline history history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None history = history and TimelineHistory.from_node(history[1].mzxid, history[0]) # get synchronization state sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None sync = SyncState.from_node(sync and sync[1].version, sync and sync[0]) # get list of members sync_standby = sync.leader == self._name and sync.members or [] members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else [] # get leader leader = self.get_node(self.leader_path) if self._LEADER in nodes else None if leader: client_id = self._client.client_id if not self._ctl and leader[0] == self._name and client_id is not None \ and client_id[0] != leader[1].ephemeralOwner: logger.info('I am leader but not owner of the session. Removing leader node') self._client.delete(self.leader_path) leader = None if leader: member = Member(-1, leader[0], None, {}) member = ([m for m in members if m.name == leader[0]] or [member])[0] leader = Leader(leader[1].version, leader[1].ephemeralOwner, member) self._fetch_cluster = member.index == -1 # get last leader operation last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(leader) # failover key failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None failover = failover and Failover.from_node(failover[1].version, failover[0]) return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)
def test_real_ttl(self):
    now = datetime.datetime.utcnow()
    member = Member(0, 'a', 'b', 'c', (now + datetime.timedelta(seconds=2)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'), None)
    self.assertLess(member.real_ttl(), 2)
    self.assertEquals(Member(0, 'a', 'b', 'c', '', None).real_ttl(), -1)
def member(node):
    return Member.from_node(node['ModifyIndex'], os.path.basename(node['Key']), node.get('Session'), node['Value'])
def test_do_POST_switchover(self, dcs): dcs.loop_wait = 10 cluster = dcs.get_cluster.return_value cluster.is_synchronous_mode.return_value = False cluster.is_paused.return_value = False post = 'POST /switchover HTTP/1.0' + self._authorization + '\nContent-Length: ' MockRestApiServer(RestApiHandler, post + '7\n\n{"1":2}') request = post + '0\n\n' MockRestApiServer(RestApiHandler, request) cluster.leader.name = 'postgresql1' MockRestApiServer(RestApiHandler, request) request = post + '25\n\n{"leader": "postgresql1"}' cluster.is_paused.return_value = True MockRestApiServer(RestApiHandler, request) cluster.is_paused.return_value = False for cluster.is_synchronous_mode.return_value in (True, False): MockRestApiServer(RestApiHandler, request) cluster.leader.name = 'postgresql2' request = post + '53\n\n{"leader": "postgresql1", "candidate": "postgresql2"}' MockRestApiServer(RestApiHandler, request) cluster.leader.name = 'postgresql1' for cluster.is_synchronous_mode.return_value in (True, False): MockRestApiServer(RestApiHandler, request) cluster.members = [ Member(0, 'postgresql0', 30, {'api_url': 'http'}), Member(0, 'postgresql2', 30, {'api_url': 'http'}) ] MockRestApiServer(RestApiHandler, request) cluster.failover = None MockRestApiServer(RestApiHandler, request) dcs.get_cluster.side_effect = [cluster] MockRestApiServer(RestApiHandler, request) cluster2 = cluster.copy() cluster2.leader.name = 'postgresql0' cluster2.is_unlocked.return_value = False dcs.get_cluster.side_effect = [cluster, cluster2] MockRestApiServer(RestApiHandler, request) cluster2.leader.name = 'postgresql2' dcs.get_cluster.side_effect = [cluster, cluster2] MockRestApiServer(RestApiHandler, request) dcs.get_cluster.side_effect = None dcs.manual_failover.return_value = False MockRestApiServer(RestApiHandler, request) dcs.manual_failover.return_value = True with patch.object(MockHa, 'fetch_nodes_statuses', Mock(return_value=[])): MockRestApiServer(RestApiHandler, request) # Valid future date request = post + '103\n\n{"leader": "postgresql1", "member": "postgresql2",' +\ ' "scheduled_at": "6016-02-15T18:13:30.568224+01:00"}' MockRestApiServer(RestApiHandler, request) with patch.object(MockPatroni, 'dcs') as d: d.manual_failover.return_value = False MockRestApiServer(RestApiHandler, request) # Exception: No timezone specified request = post + '97\n\n{"leader": "postgresql1", "member": "postgresql2",' +\ ' "scheduled_at": "6016-02-15T18:13:30.568224"}' MockRestApiServer(RestApiHandler, request) # Exception: Scheduled in the past request = post + '103\n\n{"leader": "postgresql1", "member": "postgresql2", "scheduled_at": "' MockRestApiServer(RestApiHandler, request + '1016-02-15T18:13:30.568224+01:00"}') # Invalid date self.assertIsNotNone( MockRestApiServer(RestApiHandler, request + '2010-02-29T18:13:30.568224+01:00"}'))
class TestHa(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) @patch('psycopg2.connect', psycopg2_connect) @patch('patroni.dcs.dcs_modules', Mock(return_value=['patroni.dcs.foo', 'patroni.dcs.etcd'])) @patch.object(etcd.Client, 'read', etcd_read) def setUp(self): with patch.object(Client, 'machines') as mock_machines: mock_machines.__get__ = Mock(return_value=['http://*****:*****@patch('patroni.dcs.etcd.Etcd.initialize', return_value=True) def test_start_as_standby_leader(self, initialize): self.p.data_directory_empty = true self.ha.cluster = get_cluster_not_initialized_without_leader(cluster_config=ClusterConfig(0, {}, 0)) self.ha.cluster.is_unlocked = true self.ha.patroni.config._dynamic_configuration = {"standby_cluster": {"port": 5432}} self.assertEqual(self.ha.run_cycle(), 'trying to bootstrap a new standby leader') @patch.object(Cluster, 'get_clone_member', Mock(return_value=Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}))) @patch.object(Postgresql, 'create_replica', Mock(return_value=0)) def test_start_as_cascade_replica_in_standby_cluster(self): self.p.data_directory_empty = true self.ha.cluster = get_standby_cluster_initialized_with_only_leader() self.ha.cluster.is_unlocked = false self.assertEqual(self.ha.run_cycle(), "trying to bootstrap from replica 'test'") def test_recover_replica_failed(self): self.p.controldata = lambda: {'Database cluster state': 'in recovery', 'Database system identifier': SYSID} self.p.is_running = false self.p.follow = false self.assertEqual(self.ha.run_cycle(), 'starting as a secondary') self.assertEqual(self.ha.run_cycle(), 'failed to start postgres') def test_recover_former_master(self): self.p.follow = false self.p.is_running = false self.p.name = 'leader' self.p.set_role('master') self.p.controldata = lambda: {'Database cluster state': 'shut down', 'Database system identifier': SYSID} self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'starting as readonly because i had the session lock') @patch.object(Postgresql, 'fix_cluster_state', Mock()) def test_crash_recovery(self): self.p.is_running = false self.p.controldata = lambda: {'Database cluster state': 'in production', 'Database system identifier': SYSID} self.assertEqual(self.ha.run_cycle(), 'doing crash recovery in a single user mode') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_recover_with_rewind(self): self.p.is_running = false self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'running pg_rewind from leader') @patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock): self.p.controldata = lambda: {'Database cluster state': 'in recovery', 'Database system identifier': '123'} self.ha.run_cycle() exit_mock.assert_called_once_with(1) @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) def test_start_as_readonly(self): self.p.is_leader = false self.p.is_healthy = true self.ha.has_lock = true self.p.controldata = lambda: {'Database cluster state': 'in production', 'Database system identifier': SYSID} self.assertEqual(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') @patch('psycopg2.connect', psycopg2_connect) def test_acquire_lock_as_master(self): self.assertEqual(self.ha.run_cycle(), 'acquired session lock as a leader') def test_promoted_by_acquiring_lock(self): self.ha.is_healthiest_node = true 
self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') def test_long_promote(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.p.set_role('master') self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false self.assertEqual(self.ha.run_cycle(), 'demoted self after trying and failing to obtain lock') def test_follow_new_leader_after_failing_to_obtain_lock(self): self.ha.is_healthiest_node = true self.ha.acquire_lock = false self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'following new leader after trying and failing to obtain lock') def test_demote_because_not_healthiest(self): self.ha.is_healthiest_node = false self.assertEqual(self.ha.run_cycle(), 'demoting self because i am not the healthiest node') def test_follow_new_leader_because_not_healthiest(self): self.ha.is_healthiest_node = false self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') def test_promote_because_have_lock(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'promoted self to leader because i had the session lock') def test_promote_without_watchdog(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEqual(self.ha.run_cycle(), 'Demoting self because watchdog could not be activated') self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'Not promoting self because watchdog could not be activated') def test_leader_with_lock(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false with patch.object(Watchdog, 'is_running', PropertyMock(return_value=True)): self.assertEqual(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false self.ha.has_lock = true self.ha.update_lock = false self.assertEqual(self.ha.run_cycle(), 'demoted self because failed to update leader lock in DCS') self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'not promoting because failed to update leader lock in DCS') def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') self.ha.patroni.replicatefrom = "foo" self.assertEqual(self.ha.run_cycle(), 'no action. 
i am a secondary and i am following a leader') def test_follow_in_pause(self): self.ha.cluster.is_unlocked = false self.ha.is_paused = true self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action') @patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_follow_triggers_rewind(self): self.p.is_leader = false self.p.trigger_check_diverged_lsn() self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'running pg_rewind from leader') def test_no_etcd_connection_master_demote(self): self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly')) self.assertEqual(self.ha.run_cycle(), 'demoted self because DCS is not accessible and i was a leader') @patch('time.sleep', Mock()) def test_bootstrap_from_another_member(self): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.bootstrap(), 'trying to bootstrap from replica \'other\'') def test_bootstrap_waiting_for_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.assertEqual(self.ha.bootstrap(), 'waiting for leader to bootstrap') def test_bootstrap_without_leader(self): self.ha.cluster = get_cluster_initialized_without_leader() self.p.can_create_replica_without_replication_connection = MagicMock(return_value=True) self.assertEqual(self.ha.bootstrap(), 'trying to bootstrap (without leader)') def test_bootstrap_initialize_lock_failed(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.assertEqual(self.ha.bootstrap(), 'failed to acquire initialize lock') def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.assertEqual(self.ha.bootstrap(), 'trying to bootstrap a new cluster') self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'waiting for end of recovery after bootstrap') self.p.is_leader = true self.assertEqual(self.ha.run_cycle(), 'running post_bootstrap') self.assertEqual(self.ha.run_cycle(), 'initialized a new cluster') def test_bootstrap_release_initialize_key_on_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running = false self.assertRaises(PatroniException, self.ha.post_bootstrap) def test_bootstrap_release_initialize_key_on_watchdog_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() self.p.is_running.return_value = MockPostmaster() self.p.is_leader = true with patch.object(Watchdog, 'activate', Mock(return_value=False)): self.assertEqual(self.ha.post_bootstrap(), 'running post_bootstrap') self.assertRaises(PatroniException, self.ha.post_bootstrap) @patch('psycopg2.connect', psycopg2_connect) def test_reinitialize(self): self.assertIsNotNone(self.ha.reinitialize()) self.ha.cluster = get_cluster_initialized_with_leader() self.assertIsNone(self.ha.reinitialize(True)) self.assertIsNotNone(self.ha.reinitialize()) self.ha.state_handler.name = self.ha.cluster.leader.name self.assertIsNotNone(self.ha.reinitialize()) @patch('time.sleep', Mock()) def test_restart(self): self.assertEqual(self.ha.restart({}), (True, 'restarted successfully')) self.p.restart = Mock(return_value=None) self.assertEqual(self.ha.restart({}), (False, 'postgres is still starting')) self.p.restart = false 
self.assertEqual(self.ha.restart({}), (False, 'restart failed')) self.ha.cluster = get_cluster_initialized_with_leader() self.ha.reinitialize() self.assertEqual(self.ha.restart({}), (False, 'reinitialize already in progress')) with patch.object(self.ha, "restart_matches", return_value=False): self.assertEqual(self.ha.restart({'foo': 'bar'}), (False, "restart conditions are not satisfied")) @patch('os.kill', Mock()) def test_restart_in_progress(self): with patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=True)): self.ha.restart({}, run_async=True) self.assertTrue(self.ha.restart_scheduled()) self.assertEqual(self.ha.run_cycle(), 'restart in progress') self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'restart in progress') self.ha.has_lock = true self.assertEqual(self.ha.run_cycle(), 'updated leader lock during restart') self.ha.update_lock = false self.p.set_role('master') with patch('patroni.async_executor.CriticalTask.cancel', Mock(return_value=False)): with patch('patroni.postgresql.Postgresql.terminate_starting_postmaster') as mock_terminate: self.assertEqual(self.ha.run_cycle(), 'lost leader lock during restart') mock_terminate.assert_called() @patch('requests.get', requests_get) def test_manual_failover_from_leader(self): self.ha.fetch_node_status = get_node_status() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '', None)) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None)) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla', None)) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself') self.p.rewind_needed_and_possible = true self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = get_node_status(nofailover=True) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(watchdog_failed=True) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') self.ha.fetch_node_status = get_node_status(wal_position=1) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, None)) self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.ha.run_cycle() scheduled = datetime.datetime.utcnow().replace(tzinfo=tzutc) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEqual('no action. 
i am the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_pause(self): self.ha.has_lock = true self.ha.is_paused = true scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, '', None)) self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_from_leader_in_synchronous_mode(self): self.p.is_leader = true self.ha.has_lock = true self.ha.is_synchronous_mode = true self.ha.is_failover_possible = false self.ha.process_sync_replication = Mock() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, None)) self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, 'a')) self.ha.is_failover_possible = true self.assertEqual('manual failover: demoting myself', self.ha.run_cycle()) @patch('requests.get', requests_get) def test_manual_failover_process_no_leader(self): self.p.is_leader = false self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', self.p.name, None)) self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'leader', None)) self.p.set_role('replica') self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEqual(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, self.p.name, '', None)) self.assertEqual(self.ha.run_cycle(), 'following a different leader because i am not the healthiest node') self.ha.fetch_node_status = get_node_status(reachable=False) # inaccessible, in_recovery self.p.set_role('replica') self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # set failover flag to True for all members of the cluster # this should elect the current member, as we are not going to call the API for it. self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'other', None)) self.ha.fetch_node_status = get_node_status(nofailover=True) # accessible, in_recovery self.p.set_role('replica') self.assertEqual(self.ha.run_cycle(), 'promoted self to leader by acquiring session lock') # same as previous, but set the current member to nofailover. 
In no case it should be elected as a leader self.ha.patroni.nofailover = True self.assertEqual(self.ha.run_cycle(), 'following a different leader because I am not allowed to promote') def test_manual_failover_process_no_leader_in_pause(self): self.ha.is_paused = true self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, '', 'other', None)) self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', '', None)) self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', 'blabla', None)) self.assertEqual('PAUSE: acquired session lock as a leader', self.ha.run_cycle()) self.p.is_leader = false self.p.set_role('replica') self.ha.cluster = get_cluster_initialized_without_leader(failover=Failover(0, 'leader', self.p.name, None)) self.assertEqual(self.ha.run_cycle(), 'PAUSE: promoted self to leader by acquiring session lock') def test_is_healthiest_node(self): self.ha.state_handler.is_leader = false self.ha.patroni.nofailover = False self.ha.fetch_node_status = get_node_status() self.assertTrue(self.ha.is_healthiest_node()) with patch.object(Watchdog, 'is_healthy', PropertyMock(return_value=False)): self.assertFalse(self.ha.is_healthiest_node()) with patch('patroni.postgresql.Postgresql.is_starting', return_value=True): self.assertFalse(self.ha.is_healthiest_node()) self.ha.is_paused = true self.assertFalse(self.ha.is_healthiest_node()) def test__is_healthiest_node(self): self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.p.is_leader = false self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertTrue(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status(in_recovery=False) # accessible, not in_recovery self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.fetch_node_status = get_node_status(wal_position=11) # accessible, in_recovery, wal position ahead self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) with patch('patroni.postgresql.Postgresql.timeline_wal_position', return_value=(1, 1)): self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = True self.assertFalse(self.ha._is_healthiest_node(self.ha.old_cluster.members)) self.ha.patroni.nofailover = False @patch('requests.get', requests_get) def test_fetch_node_status(self): member = Member(0, 'test', 1, {'api_url': 'http://127.0.0.1:8011/patroni'}) self.ha.fetch_node_status(member) member = Member(0, 'test', 1, {'api_url': 'http://*****:*****@patch.object(Postgresql, 'rewind_needed_and_possible', Mock(return_value=True)) def test_process_unhealthy_standby_cluster_as_cascade_replica(self): self.p.is_leader = false self.p.name = 'replica' self.ha.cluster = get_standby_cluster_initialized_with_only_leader() self.ha.is_unlocked = true self.assertTrue(self.ha.run_cycle().startswith('running pg_rewind from remote_master:')) def test_recover_unhealthy_leader_in_standby_cluster(self): self.p.is_leader = false self.p.name = 'leader' self.p.is_running = false self.p.follow = false self.ha.cluster = get_standby_cluster_initialized_with_only_leader() self.assertEqual(self.ha.run_cycle(), 'starting as a standby leader because i had the session lock') def test_recover_unhealthy_unlocked_standby_cluster(self): 
self.p.is_leader = false self.p.name = 'leader' self.p.is_running = false self.p.follow = false self.ha.cluster = get_standby_cluster_initialized_with_only_leader() self.ha.cluster.is_unlocked = true self.ha.has_lock = false self.assertEqual(self.ha.run_cycle(), 'trying to follow a remote master because standby cluster is unhealthy') def test_failed_to_update_lock_in_pause(self): self.ha.update_lock = false self.ha.is_paused = true self.p.name = 'leader' self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master after failing to update leader lock in DCS') def test_postgres_unhealthy_in_pause(self): self.ha.is_paused = true self.p.is_healthy = false self.assertEqual(self.ha.run_cycle(), 'PAUSE: postgres is not running') self.ha.has_lock = true self.assertEqual(self.ha.run_cycle(), 'PAUSE: removed leader lock because postgres is not running') def test_no_etcd_connection_in_pause(self): self.ha.is_paused = true self.ha.load_cluster_from_dcs = Mock(side_effect=DCSError('Etcd is not responding properly')) self.assertEqual(self.ha.run_cycle(), 'PAUSE: DCS is not accessible') @patch('patroni.ha.Ha.update_lock', return_value=True) @patch('patroni.ha.Ha.demote') def test_starting_timeout(self, demote, update_lock): def check_calls(seq): for mock, called in seq: if called: mock.assert_called_once() else: mock.assert_not_called() mock.reset_mock() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader() self.p.check_for_startup = true self.p.time_in_state = lambda: 30 self.assertEqual(self.ha.run_cycle(), 'PostgreSQL is still starting up, 270 seconds until timeout') check_calls([(update_lock, True), (demote, False)]) self.p.time_in_state = lambda: 350 self.ha.fetch_node_status = get_node_status(reachable=False) # inaccessible, in_recovery self.assertEqual(self.ha.run_cycle(), 'master start has timed out, but continuing to wait because failover is not possible') check_calls([(update_lock, True), (demote, False)]) self.ha.fetch_node_status = get_node_status() # accessible, in_recovery self.assertEqual(self.ha.run_cycle(), 'stopped PostgreSQL because of startup timeout') check_calls([(update_lock, True), (demote, True)]) update_lock.return_value = False self.assertEqual(self.ha.run_cycle(), 'stopped PostgreSQL while starting up because leader key was lost') check_calls([(update_lock, True), (demote, True)]) self.ha.has_lock = false self.p.is_leader = false self.assertEqual(self.ha.run_cycle(), 'no action. 
i am a secondary and i am following a leader')
    check_calls([(update_lock, False), (demote, False)])

def test_manual_failover_while_starting(self):
    self.ha.has_lock = true
    self.p.check_for_startup = true
    f = Failover(0, self.p.name, '', None)
    self.ha.cluster = get_cluster_initialized_with_leader(f)
    self.ha.fetch_node_status = get_node_status()  # accessible, in_recovery
    self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself')

@patch('patroni.ha.Ha.demote')
def test_failover_immediately_on_zero_master_start_timeout(self, demote):
    self.p.is_running = false
    self.ha.cluster = get_cluster_initialized_with_leader(sync=(self.p.name, 'other'))
    self.ha.cluster.config.data['synchronous_mode'] = True
    self.ha.patroni.config.set_dynamic_configuration({'master_start_timeout': 0})
    self.ha.has_lock = true
    self.ha.update_lock = true
    self.ha.fetch_node_status = get_node_status()  # accessible, in_recovery
    self.assertEqual(self.ha.run_cycle(), 'stopped PostgreSQL to fail over after a crash')
    demote.assert_called_once()

@patch('patroni.postgresql.Postgresql.follow')
def test_demote_immediate(self, follow):
    self.ha.has_lock = true
    self.e.get_cluster = Mock(return_value=get_cluster_initialized_without_leader())
    self.ha.demote('immediate')
    follow.assert_called_once_with(None)

def test_process_sync_replication(self):
    self.ha.has_lock = true
    mock_set_sync = self.p.set_synchronous_standby = Mock()
    self.p.name = 'leader'

    # Test sync key removed when sync mode disabled
    self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other'))
    with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync:
        self.ha.run_cycle()
        mock_delete_sync.assert_called_once()
        mock_set_sync.assert_called_once_with(None)
    mock_set_sync.reset_mock()

    # Test sync key not touched when not there
    self.ha.cluster = get_cluster_initialized_with_leader()
    with patch.object(self.ha.dcs, 'delete_sync_state') as mock_delete_sync:
        self.ha.run_cycle()
        mock_delete_sync.assert_not_called()
        mock_set_sync.assert_called_once_with(None)
    mock_set_sync.reset_mock()

    self.ha.is_synchronous_mode = true

    # Test sync standby not touched when picking the same node
    self.p.pick_synchronous_standby = Mock(return_value=('other', True))
    self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other'))
    self.ha.run_cycle()
    mock_set_sync.assert_not_called()
    mock_set_sync.reset_mock()

    # Test sync standby is replaced when switching standbys
    self.p.pick_synchronous_standby = Mock(return_value=('other2', False))
    self.ha.dcs.write_sync_state = Mock(return_value=True)
    self.ha.run_cycle()
    mock_set_sync.assert_called_once_with('other2')
    mock_set_sync.reset_mock()

    # Test sync standby is not disabled when updating dcs fails
    self.ha.dcs.write_sync_state = Mock(return_value=False)
    self.ha.run_cycle()
    mock_set_sync.assert_not_called()
    mock_set_sync.reset_mock()

    # Test changing sync standby
    self.ha.dcs.write_sync_state = Mock(return_value=True)
    self.ha.dcs.get_cluster = Mock(return_value=get_cluster_initialized_with_leader(sync=('leader', 'other')))
    # self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other'))
    self.p.pick_synchronous_standby = Mock(return_value=('other2', True))
    self.ha.run_cycle()
    self.ha.dcs.get_cluster.assert_called_once()
    self.assertEqual(self.ha.dcs.write_sync_state.call_count, 2)

    # Test updating sync standby key failed due to race
    self.ha.dcs.write_sync_state = Mock(side_effect=[True, False])
    self.ha.run_cycle()
    self.assertEqual(self.ha.dcs.write_sync_state.call_count, 2)

    # Test changing sync standby failed due to race
    self.ha.dcs.write_sync_state = Mock(return_value=True)
    self.ha.dcs.get_cluster = Mock(return_value=get_cluster_initialized_with_leader(sync=('somebodyelse', None)))
    self.ha.run_cycle()
    self.assertEqual(self.ha.dcs.write_sync_state.call_count, 1)

    # Test sync set to '*' when synchronous_mode_strict is enabled
    mock_set_sync.reset_mock()
    self.ha.is_synchronous_mode_strict = true
    self.p.pick_synchronous_standby = Mock(return_value=(None, False))
    self.ha.run_cycle()
    mock_set_sync.assert_called_once_with('*')

def test_sync_replication_become_master(self):
    self.ha.is_synchronous_mode = true
    mock_set_sync = self.p.set_synchronous_standby = Mock()
    self.p.is_leader = false
    self.p.set_role('replica')
    self.ha.has_lock = true
    mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=True)
    self.p.name = 'leader'
    self.ha.cluster = get_cluster_initialized_with_leader(sync=('other', None))

    # When we just became master nobody is sync
    self.assertEqual(self.ha.enforce_master_role('msg', 'promote msg'), 'promote msg')
    mock_set_sync.assert_called_once_with(None)
    mock_write_sync.assert_called_once_with('leader', None, index=0)
    mock_set_sync.reset_mock()

    # When writing the sync state to the DCS fails we must not promote
    self.p.set_role('replica')
    mock_write_sync.return_value = False
    self.assertTrue(self.ha.enforce_master_role('msg', 'promote msg') != 'promote msg')
    mock_set_sync.assert_not_called()

def test_unhealthy_sync_mode(self):
    self.ha.is_synchronous_mode = true
    self.p.is_leader = false
    self.p.set_role('replica')
    self.p.name = 'other'
    self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'other2'))
    mock_write_sync = self.ha.dcs.write_sync_state = Mock(return_value=True)
    mock_acquire = self.ha.acquire_lock = Mock(return_value=True)
    mock_follow = self.p.follow = Mock()
    mock_promote = self.p.promote = Mock()

    # If we don't match the sync replica we are not allowed to acquire the lock
    self.ha.run_cycle()
    mock_acquire.assert_not_called()
    mock_follow.assert_called_once()
    self.assertEqual(mock_follow.call_args[0][0], None)
    mock_write_sync.assert_not_called()
    mock_follow.reset_mock()

    # If we do match we will try to promote
    self.ha._is_healthiest_node = true
    self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'other'))
    self.ha.run_cycle()
    mock_acquire.assert_called_once()
    mock_follow.assert_not_called()
    mock_promote.assert_called_once()
    mock_write_sync.assert_called_once_with('other', None, index=0)

def test_disable_sync_when_restarting(self):
    self.ha.is_synchronous_mode = true
    self.p.name = 'other'
    self.p.is_leader = false
    self.p.set_role('replica')
    mock_restart = self.p.restart = Mock(return_value=True)
    self.ha.cluster = get_cluster_initialized_with_leader(sync=('leader', 'other'))
    self.ha.touch_member = Mock(return_value=True)
    self.ha.dcs.get_cluster = Mock(side_effect=[
        get_cluster_initialized_with_leader(sync=('leader', syncstandby))
        for syncstandby in ['other', None]])
    with patch('time.sleep') as mock_sleep:
        self.ha.restart({})
        mock_restart.assert_called_once()
        mock_sleep.assert_called()

    # Restart is still called when DCS connection fails
    mock_restart.reset_mock()
    self.ha.dcs.get_cluster = Mock(side_effect=DCSError("foo"))
    self.ha.restart({})
    mock_restart.assert_called_once()

    # We don't try to fetch the cluster state when touch_member fails
    mock_restart.reset_mock()
    self.ha.dcs.get_cluster.reset_mock()
    self.ha.touch_member = Mock(return_value=False)
    self.ha.restart({})
    mock_restart.assert_called_once()
    self.ha.dcs.get_cluster.assert_not_called()

def test_effective_tags(self):
    self.ha._disable_sync = True
    self.assertEqual(self.ha.get_effective_tags(), {'foo': 'bar', 'nosync': True})
    self.ha._disable_sync = False
    self.assertEqual(self.ha.get_effective_tags(), {'foo': 'bar'})

def test_restore_cluster_config(self):
    self.ha.cluster.config.data.clear()
    self.ha.has_lock = true
    self.ha.cluster.is_unlocked = false
    self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')

def test_watch(self):
    self.ha.cluster = get_cluster_initialized_with_leader()
    self.ha.watch(0)

def test_wakeup(self):
    self.ha.wakeup()

def test_shutdown(self):
    self.p.is_running = false
    self.ha.has_lock = true
    self.ha.shutdown()

@patch('time.sleep', Mock())
def test_leader_with_empty_directory(self):
    self.ha.cluster = get_cluster_initialized_with_leader()
    self.ha.has_lock = true
    self.p.data_directory_empty = true
    self.assertEqual(self.ha.run_cycle(), 'released leader key voluntarily as data dir empty and currently leader')
    self.assertEqual(self.p.role, 'uninitialized')
    # as has_lock is mocked out, we need to fake the leader key release
    self.ha.has_lock = false
    # will not say bootstrap from leader as replica can't self elect
    self.assertEqual(self.ha.run_cycle(), "trying to bootstrap from replica 'other'")

def test_update_cluster_history(self):
    self.p.get_master_timeline = Mock(return_value=1)
    self.ha.has_lock = true
    self.ha.cluster.is_unlocked = false
    self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')

@patch('sys.exit', return_value=1)
def test_abort_join(self, exit_mock):
    self.ha.cluster = get_cluster_not_initialized_without_leader()
    self.p.is_leader = false
    self.ha.run_cycle()
    exit_mock.assert_called_once_with(1)

def test_after_pause(self):
    self.ha.has_lock = true
    self.ha.cluster.is_unlocked = false
    self.ha.is_paused = true
    self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. i am the leader with the lock')
    self.ha.is_paused = false
    self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
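# The tests above repeatedly assign module-level callables named "true" and "false"
# in place of real properties and methods (e.g. "self.ha.has_lock = true").  Below is
# a minimal sketch of such helpers, shown only as an assumption about how the test
# module defines them: they ignore their arguments and return a constant, which makes
# them convenient stand-ins for mocked checks.
def true(*args, **kwargs):
    return True


def false(*args, **kwargs):
    return False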
def member(node):
    return Member.from_node(node.modifiedIndex, os.path.basename(node.key), node.ttl, node.value)
def member(key, value):
    return Member.from_node(value['index'], os.path.basename(key), None, value['value'])
def member(name, value, znode):
    return Member.from_node(znode.version, name, znode.ephemeralOwner, value)
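# Each of the three member() adapters above flattens a backend-specific node
# (e.g. an etcd node, a Consul KV pair, or a ZooKeeper znode) into a
# patroni.dcs.Member via Member.from_node(index, name, session, value).  The
# snippet below is only an illustrative sketch with made-up values showing what
# such a call produces.
from patroni.dcs import Member

value = '{"conn_url": "postgres://127.0.0.1:5435/postgres", "api_url": "http://127.0.0.1:8008/patroni"}'
m = Member.from_node(28, 'postgresql0', None, value)  # index, name, session, JSON payload
assert m.name == 'postgresql0'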
def _load_cluster(self):
    try:
        # get list of members
        response = self.retry(self._api.list_namespaced_pod, self._namespace,
                              label_selector=self._label_selector)
        members = [self.member(pod) for pod in response.items]

        response = self.retry(self._api.list_namespaced_kind, self._namespace,
                              label_selector=self._label_selector)
        nodes = {item.metadata.name: item for item in response.items}

        config = nodes.get(self.config_path)
        metadata = config and config.metadata
        annotations = metadata and metadata.annotations or {}

        # get initialize flag
        initialize = annotations.get(self._INITIALIZE)

        # get global dynamic configuration
        config = ClusterConfig.from_node(metadata and metadata.resource_version,
                                         annotations.get(self._CONFIG) or '{}')

        # get timeline history
        history = TimelineHistory.from_node(metadata and metadata.resource_version,
                                            annotations.get(self._HISTORY) or '[]')

        leader = nodes.get(self.leader_path)
        metadata = leader and leader.metadata
        self._leader_resource_version = metadata.resource_version if metadata else None
        self._leader_observed_subsets = leader.subsets if self.__subsets and leader else []
        annotations = metadata and metadata.annotations or {}

        # get last leader operation
        last_leader_operation = annotations.get(self._OPTIME)
        last_leader_operation = 0 if last_leader_operation is None else int(last_leader_operation)

        # get leader
        leader_record = {n: annotations.get(n) for n in
                         (self._LEADER, 'acquireTime', 'ttl', 'renewTime', 'transitions') if n in annotations}
        if (leader_record or self._leader_observed_record) and leader_record != self._leader_observed_record:
            self._leader_observed_record = leader_record
            self._leader_observed_time = time.time()

        leader = leader_record.get(self._LEADER)
        try:
            ttl = int(leader_record.get('ttl')) or self._ttl
        except (TypeError, ValueError):
            ttl = self._ttl

        if not metadata or not self._leader_observed_time or self._leader_observed_time + ttl < time.time():
            leader = None

        if metadata:
            member = Member(-1, leader, None, {})
            member = ([m for m in members if m.name == leader] or [member])[0]
            leader = Leader(response.metadata.resource_version, None, member)

        # failover key
        failover = nodes.get(self.failover_path)
        metadata = failover and failover.metadata
        failover = Failover.from_node(metadata and metadata.resource_version,
                                      metadata and metadata.annotations)

        # get synchronization state
        sync = nodes.get(self.sync_path)
        metadata = sync and sync.metadata
        sync = SyncState.from_node(metadata and metadata.resource_version, metadata and metadata.annotations)

        self._cluster = Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)
    except Exception:
        logger.exception('get_cluster')
        raise KubernetesError('Kubernetes API is not responding properly')
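# _load_cluster() converts any Kubernetes API failure into a single KubernetesError,
# so callers only need to handle that one exception type.  A hedged usage sketch,
# assuming the surrounding class exposes the usual get_cluster() wrapper that invokes
# _load_cluster() and that "dcs" is an instance of it:
try:
    cluster = dcs.get_cluster()
except KubernetesError:
    # treat the DCS as unreachable and fall back to the last known view of the cluster
    cluster = None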