def test_existing_zk(self):
    """
    ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.

    A first manager populates the group and promotes one member; a second manager created on
    the same path must then see both members.
    """
    cluster_path = "/home/my_cluster"
    original = ClusterManager(self.client, cluster_path)

    first = ServiceInstance(Endpoint("host1", 10000))
    first_id = original.add_member(first)
    second = ServiceInstance(Endpoint("host2", 10000))
    second_id = original.add_member(second)

    # Both members are stored under the slaves group, numbered in insertion order.
    first_znode = self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
    assert first_znode["data"] == ServiceInstance.pack(first)
    second_znode = self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
    assert second_znode["data"] == ServiceInstance.pack(second)

    original.promote_member(first_id)

    # Test the new ClusterManager.
    recovered = ClusterManager(self.client, cluster_path)
    assert len(recovered._cluster.members) == 2
    for known_id in (first_id, second_id):
        assert known_id in recovered._cluster.members
    assert recovered._cluster.members[first_id] == ServiceInstance.pack(first)
def test_promote_member(self):
    """Promoting a member succeeds once; promoting the same member again reports a no-op."""
    manager = ClusterManager(self.client, "/home/my_cluster")
    instance = ServiceInstance(Endpoint("host", 10000))
    member = manager.add_member(instance)

    first_attempt = manager.promote_member(member)
    assert first_attempt

    second_attempt = manager.promote_member(member)
    assert not second_attempt  # The 2nd promotion is a no-op.

    # The promoted member's data is now stored under the master group.
    master_znode = self.storage.paths["/home/my_cluster/master/member_0000000000"]
    assert master_znode["data"] == ServiceInstance.pack(instance)
def test_callbacks(self):
    """
    Drive two ClusterListeners through promotion, demotion and cluster deletion and check
    that each handler observes the expected callback.
    """
    cluster_path = "/home/my_cluster"
    manager = ClusterManager(self.client, cluster_path)

    def stored_data(znode):
        # Content currently stored for 'znode' in the fake ZK storage.
        return self.storage.paths[znode]["data"]

    # Set up 2 listeners.
    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    listener1 = ClusterListener(
        self.client,
        cluster_path,
        instance1,
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback,
        handler1.termination_callback)
    listener1.start()
    member1 = manager.add_member(instance1)

    # The second listener is created without a termination callback.
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    handler2 = CallbackHandler()
    listener2 = ClusterListener(
        self.client,
        cluster_path,
        instance2,
        handler2.promotion_callback,
        handler2.demotion_callback,
        handler2.master_callback)
    listener2.start()
    member2 = manager.add_member(instance2)

    # Test promotion.
    manager.promote_member(member1)
    assert handler1.promoted.wait(1)
    assert handler2.detected.get(True, 1) == instance1
    assert stored_data("/home/my_cluster/master/member_0000000000") == ServiceInstance.pack(instance1)
    assert stored_data("/home/my_cluster/slaves/member_0000000001") == ServiceInstance.pack(instance2)

    # Promoting the second member demotes the first one.
    manager.promote_member(member2)
    assert handler1.demoted.wait(1)
    assert handler2.promoted.wait(1)
    assert stored_data("/home/my_cluster/master/member_0000000001") == ServiceInstance.pack(instance2)
    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    # Removing the current master is reported to it as a demotion.
    manager.remove_member(member2)
    assert handler2.demoted.wait(1)

    # Test removing cluster.
    manager.remove_member(member1)
    manager.delete_cluster()
    assert handler1.terminated.wait(1)
def add_member(self, service_instance):
    """
    Add the member to the ZooKeeper group.

    NOTE:
      - New members are slaves until being promoted.
      - A new member is not added if the specified service_instance already exists in the
        group.

    :param service_instance: The ServiceInstance to register.
    :return: The member ID for the ServiceInstance generated by ZooKeeper.
    :raises TypeError: If 'service_instance' is not a ServiceInstance.
    """
    if not isinstance(service_instance, ServiceInstance):
        raise TypeError("'service_instance' should be a ServiceInstance")

    packed = ServiceInstance.pack(service_instance)

    # An instance whose packed form is already known keeps its existing member ID.
    for existing_id, existing_content in self._cluster.members.items():
        if packed == existing_content:
            log.info("%s not added because it already exists in the group" % service_instance)
            return existing_id

    # Create a sequential znode in the slaves group; the last path component is the member ID.
    znode_prefix = posixpath.join(self._cluster.slaves_group, self._cluster.MEMBER_PREFIX)
    created_path = self._client.create(znode_prefix, packed, sequence=True)
    member_id = posixpath.split(created_path)[1]

    with self._lock:
        self._cluster.members[member_id] = packed
    return member_id
def __init__(self,
             client,
             cluster_path,
             self_instance=None,
             promotion_callback=None,
             demotion_callback=None,
             master_callback=None,
             termination_callback=None):
    """
    :param client: Kazoo client.
    :param cluster_path: The path for this cluster on ZooKeeper.
    :param self_instance: The local ServiceInstance associated with this listener.
    :param promotion_callback: Invoked when 'self_instance' is promoted.
    :param demotion_callback: Invoked when 'self_instance' is demoted.
    :param master_callback: Invoked when there is a master change otherwise.
    :param termination_callback: Invoked when the cluster is terminated.

    NOTE: Callbacks are executed synchronously in Kazoo's completion thread to ensure the
          delivery order of events. Blocking the callback method means no future callbacks
          will be invoked.
    """
    self._client = client
    self._cluster = Cluster(cluster_path)

    # The packed form of 'self_instance' is what gets compared against the master's content.
    if self_instance:
        self._self_content = ServiceInstance.pack(self_instance)
    else:
        self._self_content = None

    self._master = None
    self._master_content = None

    # Callbacks that were not supplied fall back to no-ops returning True.
    self._promotion_callback = promotion_callback if promotion_callback else (lambda: True)
    self._demotion_callback = demotion_callback if demotion_callback else (lambda: True)
    self._master_callback = master_callback if master_callback else (lambda x: True)
    self._termination_callback = termination_callback if termination_callback else (lambda: True)

    # Set when the watcher detects that the master group exists.
    self._children_watch = None
def test_internal_monitor(mock_group_impl_validator, MockActiveKazooGroup):
    """
    Feed two memberships to ServerSet._internal_monitor, answer every resulting group 'info'
    lookup with a ServiceInstance, and check that both members end up tracked.
    """
    zk_client = mock.Mock(spec=KazooClient)
    group = mock.MagicMock(spec=GroupInterface)
    MockActiveKazooGroup.mock_add_spec(ActiveKazooGroup)
    MockActiveKazooGroup.return_value = group

    # by default it tries to assert that the group impl is a subclass of GroupInterface
    # since the group impl will be a mock, it doesn't pass that check, so we mock the validator
    # as well.
    mock_group_impl_validator.return_value = True

    def devnull(*args, **kwargs):
        """Join/leave handler that ignores its arguments."""

    serverset = ServerSet(
        zk_client,
        '/some/path/to/group',
        on_join=devnull,
        on_leave=devnull)

    members = [Membership(member_number) for member_number in range(2)]
    print("Members are: %s" % members)
    serverset._internal_monitor(frozenset(members))

    # Each recorded 'info' call carries (member, callback) as positional arguments; invoke
    # the callback as the group implementation would.
    for recorded_call in group.info.mock_calls:
        _name, (_member, on_info), _kwargs = recorded_call
        on_info(ServiceInstance.unpack(SERVICE_INSTANCE_JSON))

    assert len(serverset._members) == 2
def __init__(self, client, cluster_path, self_instance=None, promotion_callback=None,
             demotion_callback=None, master_callback=None, termination_callback=None):
    """
    :param client: Kazoo client.
    :param cluster_path: The path for this cluster on ZooKeeper.
    :param self_instance: The local ServiceInstance associated with this listener.
    :param promotion_callback: Invoked when 'self_instance' is promoted.
    :param demotion_callback: Invoked when 'self_instance' is demoted.
    :param master_callback: Invoked when there is a master change otherwise.
    :param termination_callback: Invoked when the cluster is terminated.

    NOTE: Callbacks are executed synchronously in Kazoo's completion thread to ensure the
          delivery order of events. Blocking the callback method means no future callbacks
          will be invoked.
    """
    # Connection and cluster layout.
    self._client = client
    self._cluster = Cluster(cluster_path)

    # Identity of the local instance (packed), or None when there is no local instance.
    packed_self = None
    if self_instance:
        packed_self = ServiceInstance.pack(self_instance)
    self._self_content = packed_self

    # No master is known until one is detected.
    self._master = None
    self._master_content = None

    # Missing callbacks are replaced by no-ops that return True.
    if not promotion_callback:
        promotion_callback = lambda: True
    if not demotion_callback:
        demotion_callback = lambda: True
    if not master_callback:
        master_callback = lambda x: True
    if not termination_callback:
        termination_callback = lambda: True
    self._promotion_callback = promotion_callback
    self._demotion_callback = demotion_callback
    self._master_callback = master_callback
    self._termination_callback = termination_callback

    self._children_watch = None  # Set when the watcher detects that the master group exists.
def test_internal_monitor(mock_group_impl_validator, MockActiveKazooGroup):
    """
    ServerSet._internal_monitor should track every membership once the group's 'info'
    callbacks have been answered.
    """
    # by default it tries to assert that the group impl is a subclass of GroupInterface
    # since the group impl will be a mock, it doesn't pass that check, so we mock the validator
    # as well.
    mock_group_impl_validator.return_value = True

    mock_zk = mock.Mock(spec=KazooClient)
    mock_group = mock.MagicMock(spec=GroupInterface)
    MockActiveKazooGroup.mock_add_spec(ActiveKazooGroup)
    MockActiveKazooGroup.return_value = mock_group

    def devnull(*args, **kwargs):
        return None

    handlers = dict(on_join=devnull, on_leave=devnull)
    serverset = ServerSet(mock_zk, '/some/path/to/group', **handlers)

    members = [Membership(index) for index in range(2)]
    print("Members are: %s" % members)
    serverset._internal_monitor(frozenset(members))

    # Reply to each 'info' request the monitor issued; the callback is the second positional
    # argument of the recorded call.
    for info_call in mock_group.info.mock_calls:
        _, (_, callback), _ = info_call
        instance = ServiceInstance.unpack(SERVICE_INSTANCE_JSON)
        callback(instance)

    assert len(serverset._members) == 2
def add_member(self, service_instance):
    """
    Add the member to the ZooKeeper group.

    NOTE:
      - New members are slaves until being promoted.
      - A new member is not added if the specified service_instance already exists in the
        group.

    :param service_instance: The ServiceInstance to register.
    :return: The member ID for the ServiceInstance generated by ZooKeeper.
    :raises TypeError: If 'service_instance' is not a ServiceInstance.
    """
    if not isinstance(service_instance, ServiceInstance):
        raise TypeError("'service_instance' should be a ServiceInstance")

    content = ServiceInstance.pack(service_instance)

    # Look for a member already registered with identical content; a private sentinel is used
    # so that any member ID value can be returned.
    not_found = object()
    known_id = not_found
    for candidate_id, candidate_content in self._cluster.members.items():
        if content == candidate_content:
            known_id = candidate_id
            break

    if known_id is not not_found:
        log.info("%s not added because it already exists in the group" % service_instance)
        return known_id

    # Not registered yet: create a sequential znode under the slaves group.
    slaves_group = self._cluster.slaves_group
    member_prefix = self._cluster.MEMBER_PREFIX
    znode_path = self._client.create(
        posixpath.join(slaves_group, member_prefix),
        content,
        sequence=True)

    # The member ID is the final component of the created znode path.
    _parent, member_id = posixpath.split(znode_path)

    with self._lock:
        self._cluster.members[member_id] = content

    return member_id
def test_demote(self):
    """The runner should stop once its own instance is demoted by another member's promotion."""
    cluster_path = "/home/test/my_cluster"
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        cluster_path,
        NoopPackageInstaller(),
        task_control,
        self._state_manager)
    manager = ClusterManager(self._client, cluster_path)
    runner.start()

    # 'self_instance' becomes the master.
    self_member = manager.add_member(self._self_instance)
    manager.promote_member(self_member)
    runner.promoted.wait(1)

    # This demotes 'self_instance', which should cause runner to stop.
    other_instance = ServiceInstance(Endpoint("another_host", 10000))
    another_member = manager.add_member(other_instance)
    manager.promote_member(another_member)

    one_second = Amount(1, Time.SECONDS)
    assert deadline(runner.join, one_second)
def test_existing_zk(self):
    """
    ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
    """
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)

    # Each instance must be stored under the slaves group, in insertion order.
    expected_slaves = (
        ("/home/my_cluster/slaves/member_0000000000", instance1),
        ("/home/my_cluster/slaves/member_0000000001", instance2),
    )
    for znode, instance in expected_slaves:
        assert self.storage.paths[znode]["data"] == ServiceInstance.pack(instance)

    manager.promote_member(member1)

    # Test the new ClusterManager.
    manager2 = ClusterManager(self.client, "/home/my_cluster")
    assert len(manager2._cluster.members) == 2
    assert member1 in manager2._cluster.members
    assert member2 in manager2._cluster.members

    packed_instance1 = ServiceInstance.pack(instance1)
    assert manager2._cluster.members[member1] == packed_instance1
def test_add_member(self):
    """Adding the same instance twice yields the same member ID and only one entry."""
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    repeated_id = manager.add_member(instance1)
    assert member1 == repeated_id  # Second insertion is ignored.

    instance2 = ServiceInstance(Endpoint("host2", 10000))
    manager.add_member(instance2)

    assert len(manager._cluster.members) == 2

    # Both instances are stored under the slaves group.
    first_znode = self.storage.paths["/home/my_cluster/slaves/member_0000000000"]
    assert first_znode["data"] == ServiceInstance.pack(instance1)
    second_znode = self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
    assert second_znode["data"] == ServiceInstance.pack(instance2)
def test_remove_cluster(self):
    """
    delete_cluster() must raise ClusterManager.Error while members remain and must remove the
    cluster path once every member has been removed.
    """
    cluster_path = "/home/my_cluster"
    manager = ClusterManager(self.client, cluster_path)

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)
    manager.promote_member(member1)

    # Deleting the cluster with members still present is an error.
    with pytest.raises(ClusterManager.Error):
        manager.delete_cluster()

    for member in (member1, member2):
        manager.remove_member(member)
    manager.delete_cluster()

    assert "/home/my_cluster" not in self.storage.paths
def test_remove_member(self):
    """Removing a member succeeds once; removing it again reports that nothing was removed."""
    manager = ClusterManager(self.client, "/home/my_cluster")
    instance = ServiceInstance(Endpoint("host", 10000))
    member = manager.add_member(instance)

    removed = manager.remove_member(member)
    assert removed

    removed_again = manager.remove_member(member)
    assert not removed_again  # The second deletion is ignored.

    master_znode = "/home/my_cluster/master/member_0000000000"
    assert master_znode not in self.storage.paths
def test_service_instance_to_json():
    """
    Check that a ServiceInstance can be unpacked from JSON and survives a pack/unpack round trip.

    The expected object mirrors the JSON fixture field by field: "aurora" carries only an
    inet6 address, "health" only an inet address and "http" both. The earlier version had
    the "aurora" and "health" addresses swapped relative to the JSON, so the comparison could
    only pass if address fields are ignored by equality.
    """
    # NOTE(review): Endpoint's positional order is assumed to be (host, port, inet, inet6),
    # as implied by the "http" entry -- confirm against Endpoint's signature.
    inet = "1.2.3.4"
    inet6 = "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"

    payload = (
        '{ "additionalEndpoints": { '
        '"aurora": { "host": "hostname", "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff", "port": 22 }, '
        '"health": { "host": "hostname", "inet": "1.2.3.4", "port": 23 }, '
        '"http": { "host": "hostname", "inet": "1.2.3.4", '
        '"inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff", "port": 23 } }, '
        '"serviceEndpoint": { "host": "hostname", "port": 24 }, '
        '"shard": 1, "status": "ALIVE" }'
    )

    service_instance = ServiceInstance(
        Endpoint("hostname", 24),
        {
            "aurora": Endpoint("hostname", 22, None, inet6),
            "health": Endpoint("hostname", 23, inet),
            "http": Endpoint("hostname", 23, inet, inet6),
        },
        'ALIVE',
        1)

    # Unpacking the fixture yields the expected object ...
    assert ServiceInstance.unpack(payload) == service_instance
    # ... and packing followed by unpacking is lossless.
    assert ServiceInstance.unpack(ServiceInstance.pack(service_instance)) == service_instance
def test_url_when_not_connected_and_cluster_has_no_proxy_url(scheme):
    """
    Check 'url' / 'raw_url' of a ZookeeperSchedulerClient with and without a proxy URL, and
    that reading them does not connect to the scheduler.

    :param scheme: Name of the additional endpoint advertised in the serverset entry; it is
                   also the scheme expected in the resolved URL.

    The "no connection was made" checks compare 'mock_calls' with an empty list.
    'assert_has_calls([])' cannot be used for that: an empty expected list is satisfied by any
    call history, so it would pass even if a connection had been attempted.
    """
    host = 'some-host.example.com'
    port = 31181

    mock_zk = mock.create_autospec(spec=TwitterKazooClient, instance=True)

    service_json = (
        '{ "additionalEndpoints": { "%(scheme)s": { "host": "%(host)s", "port": %(port)d } }, '
        '"serviceEndpoint": { "host": "%(host)s", "port": %(port)d }, '
        '"shard": 0, "status": "ALIVE" }'
    ) % dict(host=host, port=port, scheme=scheme)
    service_endpoints = [ServiceInstance.unpack(service_json)]

    expected_raw_url = '%s://%s:%d' % (scheme, host, port)

    def make_mock_client(proxy_url):
        # Build a client whose serverset lookup and scheduler connection are both mocked.
        client = scheduler_client.ZookeeperSchedulerClient(
            Cluster(proxy_url=proxy_url),
            auth=None,
            user_agent='Some-User-Agent',
            _deadline=lambda x, **kws: x())
        client.get_scheduler_serverset = mock.MagicMock(
            return_value=(mock_zk, service_endpoints))
        client.SERVERSET_TIMEOUT = Amount(0, Time.SECONDS)
        client._connect_scheduler = mock.MagicMock()
        return client

    # Without a proxy URL, 'url' is the URL resolved from the serverset.
    client = make_mock_client(proxy_url=None)
    assert client.url == expected_raw_url
    assert client.url == client.raw_url
    assert client._connect_scheduler.mock_calls == []

    # With a proxy URL, 'url' is the proxy while 'raw_url' is still the resolved URL.
    client = make_mock_client(proxy_url='https://scheduler.proxy')
    assert client.url == 'https://scheduler.proxy'
    assert client.raw_url == expected_raw_url
    assert client._connect_scheduler.mock_calls == []

    # Requesting the thrift client connects to the resolved URL's /api endpoint.
    client = make_mock_client(proxy_url=None)
    client.get_thrift_client()
    assert client.url == expected_raw_url
    client._connect_scheduler.assert_has_calls(
        [mock.call('%s://%s:%d/api' % (scheme, host, port))])

    # A second request must not connect again.
    client._connect_scheduler.reset_mock()
    client.get_thrift_client()
    assert client._connect_scheduler.mock_calls == []
def test_service_instance_to_json():
    """
    ServiceInstance.unpack must rebuild the instance described by the JSON fixture, and
    pack() followed by unpack() must return an equal instance.

    The expected endpoints follow the fixture exactly ("aurora": inet6 only, "health": inet
    only, "http": both). Previously "aurora" was built with the inet address and "health"
    with the inet6 address, i.e. swapped with respect to the JSON.
    """
    json = (
        '{ "additionalEndpoints": { '
        '"aurora": { "host": "hostname", "inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff", "port": 22 }, '
        '"health": { "host": "hostname", "inet": "1.2.3.4", "port": 23 }, '
        '"http": { "host": "hostname", "inet": "1.2.3.4", '
        '"inet6": "2001:db8:1234:ffff:ffff:ffff:ffff:ffff", "port": 23 } }, '
        '"serviceEndpoint": { "host": "hostname", "port": 24 }, '
        '"shard": 1, "status": "ALIVE" }'
    )

    # NOTE(review): the third and fourth positional Endpoint arguments are taken to be inet
    # and inet6 respectively (that is how the "http" entry lines up with the JSON) -- confirm.
    additional_endpoints = {
        "aurora": Endpoint("hostname", 22, None, "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
        "health": Endpoint("hostname", 23, "1.2.3.4"),
        "http": Endpoint("hostname", 23, "1.2.3.4", "2001:db8:1234:ffff:ffff:ffff:ffff:ffff"),
    }
    service_instance = ServiceInstance(Endpoint("hostname", 24), additional_endpoints, 'ALIVE', 1)

    assert ServiceInstance.unpack(json) == service_instance

    round_tripped = ServiceInstance.unpack(ServiceInstance.pack(service_instance))
    assert round_tripped == service_instance
def test_url_when_not_connected_and_cluster_has_no_proxy_url(scheme):
    """
    Verify the URLs reported by ZookeeperSchedulerClient and when it connects to the scheduler.

    :param scheme: Key of the additional endpoint in the serverset JSON and scheme of the
                   expected URL.

    NOTE: 'assert_has_calls([])' is satisfied by any call history, so the checks that no
          connection took place inspect '_connect_scheduler.mock_calls' directly instead.
    """
    host = 'some-host.example.com'
    port = 31181
    mock_zk = mock.create_autospec(spec=TwitterKazooClient, instance=True)

    substitutions = dict(host=host, port=port, scheme=scheme)
    service_json = (
        '{ "additionalEndpoints": { "%(scheme)s": { "host": "%(host)s", "port": %(port)d } }, '
        '"serviceEndpoint": { "host": "%(host)s", "port": %(port)d }, '
        '"shard": 0, "status": "ALIVE" }'
    ) % substitutions
    service_endpoints = [ServiceInstance.unpack(service_json)]

    def make_mock_client(proxy_url):
        # The serverset lookup returns the single endpoint above; connecting is a mock so
        # that every connection attempt is recorded.
        client = scheduler_client.ZookeeperSchedulerClient(
            Cluster(proxy_url=proxy_url),
            auth=None,
            user_agent='Some-User-Agent',
            _deadline=lambda x, **kws: x())
        client.get_scheduler_serverset = mock.MagicMock(return_value=(mock_zk, service_endpoints))
        client.SERVERSET_TIMEOUT = Amount(0, Time.SECONDS)
        client._connect_scheduler = mock.MagicMock()
        return client

    def assert_never_connected(client):
        # Strict check: not a single call to _connect_scheduler has been recorded.
        assert client._connect_scheduler.mock_calls == []

    # No proxy: 'url' and 'raw_url' agree, and reading them does not connect.
    client = make_mock_client(proxy_url=None)
    assert client.url == '%s://%s:%d' % (scheme, host, port)
    assert client.url == client.raw_url
    assert_never_connected(client)

    # Proxy configured: 'url' is the proxy, 'raw_url' the resolved endpoint; still no connect.
    client = make_mock_client(proxy_url='https://scheduler.proxy')
    assert client.url == 'https://scheduler.proxy'
    assert client.raw_url == '%s://%s:%d' % (scheme, host, port)
    assert_never_connected(client)

    # Asking for the thrift client connects to '<url>/api'.
    client = make_mock_client(proxy_url=None)
    client.get_thrift_client()
    assert client.url == '%s://%s:%d' % (scheme, host, port)
    client._connect_scheduler.assert_has_calls([mock.call('%s://%s:%d/api' % (scheme, host, port))])

    # After resetting the mock, asking again must not trigger another connection.
    client._connect_scheduler.reset_mock()
    client.get_thrift_client()
    assert_never_connected(client)
def _swap(self, master, master_content):
    """
    Record the new master and invoke the callback matching the transition.

    :param master: The new master member (falsy when there is no master).
    :param master_content: The content stored for the new master.

    NOTE: No callbacks are invoked if there is currently no master and 'self_instance' wasn't
          the master.
    """
    def holds_mastership():
        # True when a local instance is configured and its content is the master's content.
        return bool(self._self_content and self._master_content == self._self_content)

    held_before = holds_mastership()
    self._master, self._master_content = master, master_content
    holds_now = holds_mastership()

    # Invoke callbacks accordingly.
    if held_before:
        # Either we lost mastership, or nothing changed for us.
        if not holds_now:
            self._demotion_callback()
        return

    if holds_now:
        self._promotion_callback()
        return

    # Someone else is (or nobody is) the master; only report an actual master.
    if master:
        assert master_content
        self._master_callback(ServiceInstance.unpack(master_content))
def test_invalid_znode(self):
    """A master znode whose data cannot be parsed is reported to the listener as None."""
    master_group = "/home/my_cluster/master"

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    listener1 = ClusterListener(
        self.client,
        "/home/my_cluster",
        instance1,
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback)
    listener1.start()

    # Write a master entry directly, bypassing ClusterManager, with unparsable content.
    self.client.ensure_path(master_group)
    self.client.create(master_group + "/member_", "Invalid Data", sequence=True)

    # Invalid ZNode data translates into a 'None' return.
    detected_master = handler1.detected.get(True, 1)
    assert detected_master is None
def from_task(self, task, sandbox):
    """
    Build a MysosTaskRunner for 'task' that talks to a FakeClient instead of a real ZooKeeper.

    :param task: Task whose 'data' is a JSON object with 'cluster', 'port' and 'zk_url' keys.
    :param sandbox: Passed through to the task control provider.
    :return: The MysosTaskRunner for this task.
    """
    task_config = json.loads(task.data)
    cluster_name = task_config['cluster']
    port = task_config['port']
    zk_url = task_config['zk_url']

    # Only the path component of the ZK URL is used; the fake client needs no servers.
    _, servers, path = zookeeper.parse(zk_url)

    zk_client = FakeClient()
    zk_client.start()

    # The local instance is advertised under this host's resolved address.
    local_address = socket.gethostbyname(socket.gethostname())
    self_instance = ServiceInstance(Endpoint(local_address, port))

    task_control = self._task_control_provider.from_task(task, sandbox)
    cluster_path = posixpath.join(path, cluster_name)

    return MysosTaskRunner(
        self_instance,
        zk_client,
        cluster_path,
        NoopPackageInstaller(),
        task_control,
        Fake())
def mock_get_serverset(*args, **kwargs):
    """
    Serverset lookup stand-in: ignores its arguments and returns 'mock_zk' together with a
    single ServiceInstance built from the enclosing scope's 'host' and 'port'.
    """
    template = (
        '{ "additionalEndpoints": { "http": { "host": "%s", "port": %d } }, '
        '"serviceEndpoint": { "host": "%s", "port": %d }, '
        '"shard": 0, "status": "ALIVE" }'
    )
    # The same host/port pair is used for the "http" endpoint and the service endpoint.
    service_json = template % (host, port, host, port)
    instance = ServiceInstance.unpack(service_json)
    return mock_zk, [instance]
def _service_instance(vals):
    """
    Build a ServiceInstance from a JSON template.

    :param vals: Values substituted into the template's four '%d' placeholders, in order: the
                 host numbers of the "aurora", "health" and service endpoints, then the shard.
    :return: The unpacked ServiceInstance.
    """
    template = (
        '{ "additionalEndpoints": { '
        '"aurora": { "host": "smfd-akb-%d-sr1.devel.twitter.com", "port": 31181 }, '
        '"health": { "host": "smfd-akb-%d-sr1.devel.twitter.com", "port": 31181 } }, '
        '"serviceEndpoint": { "host": "smfd-akb-%d-sr1.devel.twitter.com", "port": 31181 }, '
        '"shard": %d, "status": "ALIVE" }'
    )
    rendered = template % vals
    return ServiceInstance.unpack(rendered)
def from_task(self, task, sandbox):
    """
    Build the MysosTaskRunner for 'task'.

    :param task: Task whose 'data' is a JSON object with 'cluster', 'host', 'port' and
                 'zk_url' keys.
    :param sandbox: Passed to the task control, installer and backup store providers and to
                    the StateManager.
    :return: A MysosTaskRunner wired to a started KazooClient.
    :raises TaskError: If a provider raises TaskControl.Error or PackageInstaller.Error.

    NOTE: The Kazoo client is started before the remaining components are built. If any later
          step fails, the client is stopped and closed before the error propagates so that
          its connection is not leaked.
    """
    data = json.loads(task.data)
    cluster_name, host, port, zk_url = (
        data['cluster'], data['host'], data['port'], data['zk_url'])

    _, servers, path = parse(zk_url)
    kazoo = KazooClient(servers)
    kazoo.start()

    try:
        self_instance = ServiceInstance(Endpoint(host, port))

        try:
            task_control = self._task_control_provider.from_task(task, sandbox)
            installer = self._installer_provider.from_task(task, sandbox)
            backup_store = self._backup_store_provider.from_task(task, sandbox)
        except (TaskControl.Error, PackageInstaller.Error) as e:
            # str(e) rather than the deprecated, Python-2-only 'e.message'.
            raise TaskError(str(e))

        state_manager = StateManager(sandbox, backup_store)

        return MysosTaskRunner(
            self_instance,
            kazoo,
            get_cluster_path(path, cluster_name),
            installer,
            task_control,
            state_manager)
    except Exception:
        # Nobody else holds a reference to the client at this point; release it ourselves.
        kazoo.stop()
        kazoo.close()
        raise
def test_reparent(self):
    """The runner should report another instance's promotion as its new master, then stop."""
    cluster_path = "/home/test/my_cluster"
    task_control = FakeTaskControl()
    runner = MysosTaskRunner(
        self._self_instance,
        self._client,
        cluster_path,
        NoopPackageInstaller(),
        task_control,
        self._state_manager)
    manager = ClusterManager(self._client, cluster_path)
    runner.start()

    # Promote another instance.
    master = ServiceInstance(Endpoint("another_host", 10000))
    another_member = manager.add_member(master)
    manager.promote_member(another_member)

    detected_master = runner.master.get(True, 1)
    assert detected_master == master

    assert runner.stop()
    join_timeout = Amount(1, Time.SECONDS)
    assert deadline(runner.join, join_timeout)
def test_promote_member(self):
    """A member can be promoted once; a repeated promotion returns a falsy result."""
    manager = ClusterManager(self.client, "/home/my_cluster")
    instance = ServiceInstance(Endpoint("host", 10000))
    member = manager.add_member(instance)

    promoted = manager.promote_member(member)
    assert promoted
    promoted_again = manager.promote_member(member)
    assert not promoted_again  # The 2nd promotion is a no-op.

    # After promotion the instance's packed data lives under the master group.
    expected_data = ServiceInstance.pack(instance)
    stored_data = self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"]
    assert stored_data == expected_data
def test_add_member(self):
    """add_member() is idempotent per instance and stores each new instance as a slave."""
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    assert member1 == manager.add_member(instance1)  # Second insertion is ignored.

    instance2 = ServiceInstance(Endpoint("host2", 10000))
    manager.add_member(instance2)

    assert len(manager._cluster.members) == 2

    expected_slaves = (
        ("/home/my_cluster/slaves/member_0000000000", instance1),
        ("/home/my_cluster/slaves/member_0000000001", instance2),
    )
    for znode, instance in expected_slaves:
        assert self.storage.paths[znode]["data"] == ServiceInstance.pack(instance)
def test_callbacks(self):
    """
    Two listeners watch the same cluster; check the callbacks each handler receives while
    members are promoted and removed and the cluster is finally deleted.
    """
    manager = ClusterManager(self.client, "/home/my_cluster")

    # Set up 2 listeners.
    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    callbacks1 = (
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback,
        handler1.termination_callback,
    )
    listener1 = ClusterListener(self.client, "/home/my_cluster", instance1, *callbacks1)
    listener1.start()
    member1 = manager.add_member(instance1)

    instance2 = ServiceInstance(Endpoint("host2", 10000))
    handler2 = CallbackHandler()
    # No termination callback is registered for the second listener.
    callbacks2 = (
        handler2.promotion_callback,
        handler2.demotion_callback,
        handler2.master_callback,
    )
    listener2 = ClusterListener(self.client, "/home/my_cluster", instance2, *callbacks2)
    listener2.start()
    member2 = manager.add_member(instance2)

    packed1 = ServiceInstance.pack
    # Test promotion.
    manager.promote_member(member1)
    assert handler1.promoted.wait(1)
    assert handler2.detected.get(True, 1) == instance1
    master0 = self.storage.paths["/home/my_cluster/master/member_0000000000"]
    assert master0["data"] == packed1(instance1)
    slave1 = self.storage.paths["/home/my_cluster/slaves/member_0000000001"]
    assert slave1["data"] == packed1(instance2)

    # Promote the other member: the first is demoted and its master entry disappears.
    manager.promote_member(member2)
    assert handler1.demoted.wait(1)
    assert handler2.promoted.wait(1)
    master1 = self.storage.paths["/home/my_cluster/master/member_0000000001"]
    assert master1["data"] == packed1(instance2)
    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    # Removing the master triggers its demotion callback.
    manager.remove_member(member2)
    assert handler2.demoted.wait(1)

    # Test removing cluster.
    manager.remove_member(member1)
    manager.delete_cluster()
    assert handler1.terminated.wait(1)
def setUp(self):
    """Create the fake ZK storage and a started client, the local instance and a fake state manager."""
    storage = FakeStorage(SequentialThreadingHandler())
    self._storage = storage

    # The client shares the storage above and is started right away.
    self._client = FakeClient(storage=storage)
    self._client.start()

    local_endpoint = Endpoint("host", 10000)
    self._self_instance = ServiceInstance(local_endpoint)
    self._state_manager = FakeStateManager()
def status_update(self, status):
    """
    Handle the status update for a task of this cluster.

    NOTE:
      Duplicate status updates may be handled by either the same scheduler instance or a new
      instance with the restored state.

    The whole update is processed while holding 'self._lock'. Updates for unknown tasks,
    duplicate updates and updates for tasks already in a terminal state are logged and
    ignored.

    :param status: The task status; this method reads 'task_id.value', 'state',
                   'slave_id.value' and 'message' from it.
    :raises self.Error: If the task reports TASK_FINISHED, which is treated as unexpected.
    """
    with self._lock:
        task_id = status.task_id.value

        if task_id not in self._cluster.tasks:
            log.warn("Ignoring status update for unknown task %s" % task_id)
            return

        task = self._cluster.tasks[task_id]
        previous_state = task.state

        # We don't want to ignore a duplicate update if the previous one was not successfully handled.
        # Therefore, we should not checkpoint the status change until we have finished all operations.
        if previous_state == status.state:
            log.info('Ignoring duplicate status update %s for task %s' % (mesos_pb2.TaskState.Name(status.state), task_id))
            return

        if is_terminal(previous_state):
            log.info(
                'Ignoring status update %s for task %s as it is in terminal state %s' % (mesos_pb2.TaskState.Name(status.state), task_id, mesos_pb2.TaskState.Name(previous_state)))
            return

        log.info('Updating state of task %s of cluster %s from %s to %s' % (status.task_id.value, self.cluster_name, mesos_pb2.TaskState.Name(previous_state), mesos_pb2.TaskState.Name(status.state)))
        # The new state is only recorded in memory here; it is persisted by the
        # dump_cluster_state() calls below.
        task.state = status.state

        if status.state == mesos_pb2.TASK_RUNNING:
            # Register this cluster member.
            endpoint = Endpoint(self._cluster.tasks[task_id].hostname, self._cluster.tasks[task_id].port)

            # If the scheduler fails over after ZK is updated but before the state change is
            # checkpointed, it will receive the same status update again and try to publish a duplicate
            # member to ZK. ClusterManager.add_member() is idempotent and doesn't update ZK in this
            # case.
            member_id = self._cluster_manager.add_member(
                ServiceInstance(endpoint))
            log.info('Added %s (member id=%s) to cluster %s' % (endpoint, member_id, self.cluster_name))
            self._cluster.members[task_id] = member_id

            # Checkpoint the status update here. It's OK if the elector fails to launch later because
            # the new scheduler instance will retry based on the fact that there are running instances
            # of the cluster but no master.
            self._state_provider.dump_cluster_state(self._cluster)

            # If MySQL master is already elected for this cluster don't bother adding it to the elector.
            if self._cluster.master_id:
                log.info(
                    "MySQL slave task %s on %s started after a master is already elected for cluster %s" % (task_id, endpoint.host, self.cluster_name))
                return

            if not self._elector:
                # No election in progress: start one, seeded with every running task.
                self._elector = self._new_elector()
                # Add current slaves.
                for t in self._cluster.running_tasks:
                    self._elector.add_slave(t.task_id, t.mesos_slave_id)
                self._elector.start()
            else:
                # An election is already in progress: just add this task to it.
                self._elector.add_slave(task_id, status.slave_id.value)
        elif status.state == mesos_pb2.TASK_FINISHED:
            # TASK_FINISHED is handled before the generic terminal-state branch below and is
            # reported as an error rather than as a normal termination.
            raise self.Error(
                "Task %s is in unexpected state %s with message '%s'" % (status.task_id.value, mesos_pb2.TaskState.Name(status.state), status.message))
        elif is_terminal(status.state):
            if status.state == mesos_pb2.TASK_KILLED:
                log.info("Task %s was successfully killed" % status.task_id.value)
            else:
                log.error(
                    "Task %s is now in terminal state %s with message '%s'" % (status.task_id.value, mesos_pb2.TaskState.Name(
                        status.state), status.message))
            del self._cluster.tasks[task_id]

            if task_id in self._cluster.members:
                member_id = self._cluster.members[task_id]
                del self._cluster.members[task_id]

                # If the scheduler fails over after ZK is updated but before its result is persisted, it
                # will receive the same status update and try to remove the non-existent member.
                # Removing a non-existent member is a no-op for ClusterManager.remove_member().
                # Note that if the order is reversed, the scheduler will fail to clean up the orphan ZK
                # entry.
                self._cluster_manager.remove_member(member_id)

                if member_id == self._cluster.master_id:
                    self._cluster.master_id = None
                    log.info(
                        "Master of cluster %s has terminated. Restarting election" % self.cluster_name)

                    assert not self._elector, "Election must not be running since there is a current master"
                    self._elector = self._new_elector()

                    # Add current slaves after removing the terminated task.
                    for t in self._cluster.running_tasks:
                        self._elector.add_slave(t.task_id, t.mesos_slave_id)
                    self._elector.start()
                else:
                    # It will be rescheduled next time the launcher is given an offer.
                    log.info("Slave %s of cluster %s has terminated" % (task_id, self.cluster_name))
            else:
                # The task never became a member, so it cannot have been running.
                assert previous_state != mesos_pb2.TASK_RUNNING, (
                    "Task must exist in ClusterManager if it was running")
                log.warn("Slave %s of cluster %s failed to start running" % (task_id, self.cluster_name))

            # NOTE(review): this check is laid out as part of the terminal-state branch; the
            # source text had lost its indentation, so confirm the nesting against the
            # original file. Returning here skips the final checkpoint below.
            if self.terminated:
                log.info("Shutting down launcher for cluster %s" % self.cluster_name)
                self._shutdown()
                return

        # Finally, checkpoint the status update.
        self._state_provider.dump_cluster_state(self._cluster)
        log.info(
            "Checkpointed the status update for task %s of cluster %s" % (task_id, self.cluster_name))