def setUp(self):
  self._driver = FakeDriver()
  self._storage = FakeStorage(SequentialThreadingHandler())
  self._zk_client = FakeClient(storage=self._storage)
  self._zk_client.start()

  self._framework_id = mesos_pb2.FrameworkID()
  self._framework_id.value = "framework_id_0"

  self._offer = mesos_pb2.Offer()
  self._offer.id.value = "offer_id_0"
  self._offer.framework_id.value = self._framework_id.value
  self._offer.slave_id.value = "slave_id_0"
  self._offer.hostname = "localhost"

  resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
  self._offer.resources.extend(resources)

  self._framework_user = "******"

  self._zk_url = "zk://host/mysos/test"
  self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

  self._tmpdir = tempfile.mkdtemp()
  self._state_provider = LocalStateProvider(self._tmpdir)

  framework_info = mesos_pb2.FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)
  self._state = Scheduler(framework_info)
def test_scheduler_runs():
  """
    Verifies that the scheduler successfully launches 3 "no-op" MySQL tasks.

    NOTE: Due to the limitation of zake the scheduler's ZK operations are not propagated to
    executors in separate processes but they are unit-tested separately.
  """
  import mesos.native

  # Make sure fake_mysos_executor.pex is available to be fetched by Mesos slave.
  assert os.path.isfile('dist/fake_mysos_executor.pex')

  storage = FakeStorage(SequentialThreadingHandler())
  zk_client = FakeClient(storage=storage)
  zk_client.start()

  zk_url = "zk://fake_host/home/mysos/clusters"
  cluster_name = "test_cluster"
  num_nodes = 3

  state_provider = LocalStateProvider(safe_mkdtemp())

  framework_info = FrameworkInfo(
      user=getpass.getuser(),
      name="mysos",
      checkpoint=False)
  state = Scheduler(framework_info)

  scheduler = MysosScheduler(
      state,
      state_provider,
      getpass.getuser(),
      os.path.abspath("dist/fake_mysos_executor.pex"),
      "./fake_mysos_executor.pex",
      zk_client,
      zk_url,
      Amount(40, Time.SECONDS),
      "/fakepath",
      gen_encryption_key())

  scheduler_driver = mesos.native.MesosSchedulerDriver(
      scheduler,
      framework_info,
      "local")
  scheduler_driver.start()

  # Wait until the scheduler is connected and becomes available.
  assert scheduler.connected.wait(30)

  scheduler.create_cluster(cluster_name, "mysql_user", num_nodes)

  # A slave is promoted to be the master.
  deadline(
      lambda: wait_for_master(
          get_cluster_path(posixpath.join(zk_url, 'discover'), cluster_name),
          zk_client),
      Amount(40, Time.SECONDS))

  assert scheduler_driver.stop() == DRIVER_STOPPED
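# The end-to-end test above blocks until wait_for_master() observes a master member under the
# cluster's ZK path. As a rough illustration only (NOT the actual mysos helper; the name and
# details below are assumptions), such a polling helper could look like the sketch that follows.
# deadline() supplies the timeout, so the loop itself can be unbounded.
import posixpath
import time

def wait_for_master_sketch(cluster_path, zk_client, poll_interval=0.5):
  """Block until the 'master' group under cluster_path has at least one member."""
  master_path = posixpath.join(cluster_path, "master")
  while True:
    if zk_client.exists(master_path) and zk_client.get_children(master_path):
      return True
    time.sleep(poll_interval)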
def test_invalid_arguments(self):
  client = FakeClient()
  client.start()
  manager = ClusterManager(client, "/home/my_cluster")

  with pytest.raises(ValueError) as e:
    manager.promote_member("123")
  assert e.value.message == "Invalid member_id: 123"
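# For context, the ValueError asserted above comes from ClusterManager rejecting an ID that does
# not name a ZooKeeper sequence member. A hypothetical check of that shape (illustrative only,
# not the real ClusterManager code; the "member_" prefix is an assumption) is:
MEMBER_PREFIX = "member_"

def validate_member_id(member_id):
  if not member_id.startswith(MEMBER_PREFIX):
    raise ValueError("Invalid member_id: %s" % member_id)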
def setup(self, request):
  self._driver = FakeDriver()

  self._storage = FakeStorage(SequentialThreadingHandler())
  self._zk_client = FakeClient(storage=self._storage)
  self._zk_client.start()

  self._offer = mesos_pb2.Offer()
  self._offer.id.value = "offer_id_0"
  self._offer.framework_id.value = "framework_id_0"
  self._offer.slave_id.value = "slave_id_0"
  self._offer.hostname = "localhost"

  # Enough memory and ports to fit three tasks.
  resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
  self._offer.resources.extend(resources)

  self._framework_user = "******"

  # Some tests use the default launcher; some don't.
  self._zk_url = "zk://host/mysos/test"
  self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

  # Construct the state provider based on the test parameter.
  if request.param == LocalStateProvider:
    tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(tmpdir)
    request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
  elif request.param == ZooKeeperStateProvider:
    self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

  self._launcher = MySQLClusterLauncher(
      self._driver,
      self._cluster,
      self._state_provider,
      self._zk_url,
      self._zk_client,
      self._framework_user,
      "./executor.pex",
      "cmd.sh",
      Amount(5, Time.SECONDS),
      "/etc/mysos/admin_keyfile.yml",
      query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

  self._elected = threading.Event()
  self._launchers = [self._launcher]  # See teardown().

  request.addfinalizer(self.teardown)
def from_task(self, task, sandbox):
  data = json.loads(task.data)
  cluster_name, port, zk_url = data['cluster'], data['port'], data['zk_url']
  _, servers, path = zookeeper.parse(zk_url)

  zk_client = FakeClient()
  zk_client.start()
  self_instance = ServiceInstance(Endpoint(socket.gethostbyname(socket.gethostname()), port))

  task_control = self._task_control_provider.from_task(task, sandbox)

  return MysosTaskRunner(
      self_instance,
      zk_client,
      posixpath.join(path, cluster_name),
      NoopPackageInstaller(),
      task_control,
      Fake())
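# from_task() above expects 'task.data' to be a JSON document with at least the 'cluster',
# 'port' and 'zk_url' keys. A minimal example payload (values are illustrative; real tasks may
# carry additional fields):
import json

example_task_data = json.dumps({
    "cluster": "cluster0",
    "port": 10000,
    "zk_url": "zk://zk_host:2181/mysos/discover",
})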
def setup(self, request):
  self._driver = FakeDriver()

  self._storage = FakeStorage(SequentialThreadingHandler())
  self._zk_client = FakeClient(storage=self._storage)
  self._zk_client.start()

  self._offer = mesos_pb2.Offer()
  self._offer.id.value = "offer_id_0"
  self._offer.framework_id.value = "framework_id_0"
  self._offer.slave_id.value = "slave_id_0"
  self._offer.hostname = "localhost"

  # Enough memory and ports to fit three tasks.
  resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
  self._offer.resources.extend(resources)

  self._framework_user = "******"

  # Some tests use the default launcher; some don't.
  self._zk_url = "zk://host/mysos/test"
  self._scheduler_key = gen_encryption_key()
  self._password_box = PasswordBox(self._scheduler_key)
  self._cluster = MySQLCluster("cluster0", "user", self._password_box.encrypt("pass"), 3)

  # Construct the state provider based on the test parameter.
  if request.param == LocalStateProvider:
    tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(tmpdir)
    request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
  elif request.param == ZooKeeperStateProvider:
    self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

  self._launcher = MySQLClusterLauncher(
      self._driver,
      self._cluster,
      self._state_provider,
      self._zk_url,
      self._zk_client,
      self._framework_user,
      "./executor.pex",
      "cmd.sh",
      Amount(5, Time.SECONDS),
      "/etc/mysos/admin_keyfile.yml",
      self._scheduler_key,
      query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

  self._elected = threading.Event()
  self._launchers = [self._launcher]  # See teardown().

  request.addfinalizer(self.teardown)
def test_return_immediately_when_blocking_on_empty_queue_and_available_task_comes_in(self):
  """
  Set up several threads waiting for work; insert several pieces of work;
  make sure each thread finishes.
  """
  client = FakeClient()
  client.start()
  queue = ZKDelayDeadlineQueue(client, "/")

  tpe = ThreadPoolExecutor()

  def time_get():
    queue = ZKDelayDeadlineQueue(client, "/")
    start_time = time.time()
    with queue.get(timeout=1.0) as si:
      pass
    return time.time() - start_time, si

  fut1 = tpe.submit(time_get)
  fut2 = tpe.submit(time_get)
  fut3 = tpe.submit(time_get)

  begin = time.time()
  si1 = make_si(wait_until=begin, bounce_by=begin)
  queue.put(si1)
  si2 = make_si(wait_until=begin + 0.01, bounce_by=begin + 0.01)
  queue.put(si2)
  si3 = make_si(wait_until=begin + 0.02, bounce_by=begin + 0.02)
  queue.put(si3)

  times = sorted([x.result(timeout=2.0) for x in [fut1, fut2, fut3]])

  assert times[0][0] < 0.011
  assert times[0][1] == si1
  assert 0.009 < times[1][0] < 0.021
  assert times[1][1] == si2
  assert 0.019 < times[2][0] < 0.031
  assert times[2][1] == si3
async def test_functional():
  """Test as much of the whole stack as we can."""
  config = {
      'deadman': {
          'plugins': 'zgres#zookeeper\nzgres#apt\nzgres#ec2-snapshot\nzgres#ec2\nzgres#follow-the-leader\nzgres#select-furthest-ahead-replica',
      },
      'apt': {
          'postgresql_cluster_name': 'main',
          'postgresql_version': '9.5',
      },
  }
  zk = FakeClient()
  with mock.patch('zgres.zookeeper.KazooClient') as KazooClient, \
      mock.patch('zgres.ec2.boto.utils.get_instance_metadata'):
    KazooClient.return_value = zk
    app = deadman.App(config)
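# The functional test above swaps the real kazoo client for zake's in-memory FakeClient by
# patching the name that zgres.zookeeper looks up at construction time. A standalone sketch of
# the same substitution pattern (module paths as used above; the helper and factory names are
# illustrative assumptions):
from unittest import mock
from zake.fake_client import FakeClient

def build_app_with_fake_zk(config, app_factory):
  zk = FakeClient()
  with mock.patch('zgres.zookeeper.KazooClient', return_value=zk):
    # Any code inside this block that constructs zgres.zookeeper.KazooClient(...) gets the fake.
    return app_factory(config), zk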
class TestScheduler(unittest.TestCase): def setUp(self): self._driver = FakeDriver() self._storage = FakeStorage(SequentialThreadingHandler()) self._zk_client = FakeClient(storage=self._storage) self._zk_client.start() self._framework_id = mesos_pb2.FrameworkID() self._framework_id.value = "framework_id_0" self._offer = mesos_pb2.Offer() self._offer.id.value = "offer_id_0" self._offer.framework_id.value = self._framework_id.value self._offer.slave_id.value = "slave_id_0" self._offer.hostname = "localhost" resources = create_resources(cpus=DEFAULT_TASK_CPUS * 3, mem=DEFAULT_TASK_MEM * 3, disk=DEFAULT_TASK_DISK * 3, ports=set([10000, 10001, 10002])) self._offer.resources.extend(resources) self._framework_user = "******" self._zk_url = "zk://host/mysos/test" self._cluster = MySQLCluster("cluster0", "user", "pass", 3, DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK) self._tmpdir = tempfile.mkdtemp() self._state_provider = LocalStateProvider(self._tmpdir) framework_info = mesos_pb2.FrameworkInfo(user=getpass.getuser(), name="mysos", checkpoint=False) self._state = Scheduler(framework_info) def tearDown(self): shutil.rmtree(self._tmpdir, True) # Clean up after ourselves. def test_scheduler_recovery(self): scheduler_key = gen_encryption_key() scheduler1 = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler1.registered(self._driver, self._framework_id, object()) scheduler1.create_cluster("cluster1", "mysql_user", 3) scheduler1.resourceOffers(self._driver, [self._offer]) # One task is launched for one offer. assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1 with pytest.raises(MysosScheduler.ClusterExists): scheduler1.create_cluster("cluster1", "mysql_user", 3) # FrameworkID should have been persisted. self._state = self._state_provider.load_scheduler_state() assert self._state.framework_info.id.value == self._framework_id.value # Simulate restart. scheduler2 = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) # Scheduler always receives registered() with the same FrameworkID after failover. scheduler2.registered(self._driver, self._framework_id, object()) assert len(scheduler2._launchers) == 1 assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" # Scheduler has recovered the cluster so it doesn't accept another of the same name. with pytest.raises(MysosScheduler.ClusterExists): scheduler2.create_cluster("cluster1", "mysql_user", 3) def test_scheduler_recovery_failure_before_launch(self): scheduler_key = gen_encryption_key() scheduler1 = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler1.registered(self._driver, self._framework_id, object()) _, password = scheduler1.create_cluster("cluster1", "mysql_user", 3) # Simulate restart before the task is successfully launched. scheduler2 = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) assert len(scheduler2._launchers) == 0 # No launchers are recovered. 
# Scheduler always receives registered() with the same FrameworkID after failover. scheduler2.registered(self._driver, self._framework_id, object()) assert len(scheduler2._launchers) == 1 assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" password_box = PasswordBox(scheduler_key) assert password_box.match( password, scheduler2._launchers["cluster1"]._cluster.encrypted_password) # Now offer the resources for this task. scheduler2.resourceOffers(self._driver, [self._offer]) # One task is launched for the offer. assert len( scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1 # Scheduler has recovered the cluster so it doesn't accept another of the same name. with pytest.raises(MysosScheduler.ClusterExists): scheduler2.create_cluster("cluster1", "mysql_user", 3) def test_incompatible_resource_role(self): scheduler1 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", gen_encryption_key(), framework_role='mysos' ) # Require 'mysos' but the resources are in '*'. scheduler1.registered(self._driver, self._framework_id, object()) scheduler1.create_cluster("cluster1", "mysql_user", 3) scheduler1.resourceOffers(self._driver, [self._offer]) assert "declineOffer" in self._driver.method_calls assert len(self._driver.method_calls["declineOffer"]) == 1 # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is # a 'Filters' object. assert (self._driver.method_calls["declineOffer"][0][0][1]. refuse_seconds == INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_( Time.SECONDS)) def test_scheduler_metrics(self): scheduler_key = gen_encryption_key() scheduler = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) RootMetrics().register_observable('scheduler', scheduler) scheduler.registered(self._driver, self._framework_id, object()) scheduler.create_cluster("cluster1", "mysql_user", 3, cluster_password='******') sample = RootMetrics().sample() assert sample['scheduler.cluster_count'] == 1 assert sample[ 'scheduler.total_requested_mem_mb'] == DEFAULT_TASK_MEM.as_( Data.MB) * 3 assert sample[ 'scheduler.total_requested_disk_mb'] == DEFAULT_TASK_DISK.as_( Data.MB) * 3 assert sample[ 'scheduler.total_requested_cpus'] == DEFAULT_TASK_CPUS * 3 scheduler.delete_cluster("cluster1", 'test_password') sample = RootMetrics().sample() assert sample['scheduler.cluster_count'] == 0 assert sample['scheduler.total_requested_mem_mb'] == 0 assert sample['scheduler.total_requested_disk_mb'] == 0 assert sample['scheduler.total_requested_cpus'] == 0 def test_scheduler_delete_empty_cluster(self): scheduler_key = gen_encryption_key() scheduler = MysosScheduler(self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler.registered(self._driver, self._framework_id, object()) _, password = scheduler.create_cluster("cluster1", "mysql_user", 3) assert len(scheduler._launchers) == 1 # Deleting the cluster before any offer comes in for launching any task. scheduler.delete_cluster("cluster1", password) assert len(scheduler._launchers) == 0
class TestLauncher(object): @pytest.fixture(params=[LocalStateProvider, ZooKeeperStateProvider], autouse=True) def setup(self, request): self._driver = FakeDriver() self._storage = FakeStorage(SequentialThreadingHandler()) self._zk_client = FakeClient(storage=self._storage) self._zk_client.start() self._offer = mesos_pb2.Offer() self._offer.id.value = "offer_id_0" self._offer.framework_id.value = "framework_id_0" self._offer.slave_id.value = "slave_id_0" self._offer.hostname = "localhost" # Enough memory and ports to fit three tasks. resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002])) self._offer.resources.extend(resources) self._framework_user = "******" # Some tests use the default launcher; some don't. self._zk_url = "zk://host/mysos/test" self._scheduler_key = gen_encryption_key() self._password_box = PasswordBox(self._scheduler_key) self._cluster = MySQLCluster("cluster0", "user", self._password_box.encrypt("pass"), 3) # Construct the state provider based on the test parameter. if request.param == LocalStateProvider: tmpdir = tempfile.mkdtemp() self._state_provider = LocalStateProvider(tmpdir) request.addfinalizer(lambda: shutil.rmtree(tmpdir, True)) # Clean up after ourselves. elif request.param == ZooKeeperStateProvider: self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test") self._launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key, query_interval=Amount(150, Time.MILLISECONDS)) # Short interval. self._elected = threading.Event() self._launchers = [self._launcher] # See teardown(). request.addfinalizer(self.teardown) def teardown(self): for launcher in self._launchers: if launcher._elector: launcher._elector.abort() # Abort the thread even if the election is pending. launcher._elector.join() def test_launch_cluster_all_nodes_successful(self): for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING # Valid state. status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 def test_launch_cluster_insufficient_resources(self): """All but one slave in the slave are launched successfully.""" del self._offer.resources[:] resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001])) self._offer.resources.extend(resources) # There is one fewer port than required to launch the entire cluster. 
for i in range(self._cluster.num_nodes - 1): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes - 1 # The final task cannot get launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1 # The two nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING # Valid state. status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes - 1): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # One slave. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 1 def test_two_launchers(self): """Two launchers share resources and launch their clusters successfully.""" launchers = [ MySQLClusterLauncher( self._driver, MySQLCluster("cluster0", "user0", self._password_box.encrypt("pass0"), 1), self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key), MySQLClusterLauncher( self._driver, MySQLCluster("cluster1", "user1", self._password_box.encrypt("pass1"), 2), self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key)] self._launchers.extend(launchers) resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002])) self._offer.resources.extend(resources) # Three nodes in total across two clusters. # Simulate the scheduler. for i in range(3): for launcher in launchers: task_id, remaining = launcher.launch(self._offer) if task_id: # Update the offer so other launchers will use its remaining resources. del self._offer.resources[:] self._offer.resources.extend(remaining) break tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == 3 def test_invalid_status_update(self): """Launcher raises an exception when an invalid status is received.""" self._cluster.num_nodes = 1 launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key) self._launchers.append(launcher) resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000])) self._offer.resources.extend(resources) task_id, _ = launcher.launch(self._offer) assert task_id == "mysos-cluster0-0" tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes status = mesos_pb2.TaskStatus() status.task_id.value = task_id status.state = mesos_pb2.TASK_RUNNING # Valid state. launcher.status_update(status) status.state = mesos_pb2.TASK_FINISHED # An invalid state. 
with pytest.raises(MySQLClusterLauncher.Error): launcher.status_update(status) def test_terminal_status_update(self): """Launcher reacts to terminated task by launching a new one.""" self._cluster.num_nodes = 1 launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(1, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key) self._launchers.append(launcher) resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000])) self._offer.resources.extend(resources) task_id, _ = launcher.launch(self._offer) assert task_id == "mysos-cluster0-0" launched = self._driver.method_calls["launchTasks"] assert len(launched) == self._cluster.num_nodes status = mesos_pb2.TaskStatus() status.task_id.value = task_id status.state = mesos_pb2.TASK_RUNNING launcher.status_update(status) assert len(launcher._cluster.running_tasks) == 1 status.state = mesos_pb2.TASK_LOST launcher.status_update(status) assert len(launcher._cluster.running_tasks) == 0 task_id, _ = launcher.launch(self._offer) assert task_id == "mysos-cluster0-1" launched = self._driver.method_calls["launchTasks"] # One task is relaunched to make up for the lost one. assert len(launched) == self._cluster.num_nodes + 1 def test_master_failover(self): for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) # No log positions queries are sent for the first epoch. assert "sendFrameworkMessage" not in self._driver.method_calls # Wait for the election to complete. deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Now fail the master task. status.task_id.value = "mysos-cluster0-0" status.state = mesos_pb2.TASK_FAILED self._launcher.status_update(status) assert len(self._launcher._cluster.running_tasks) == 2 # Log positions queries are sent. self._launcher._elector._elect() assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2 for i in range(1, self._cluster.num_nodes): self._launcher.framework_message( "mysos-cluster0-%s" % i, self._offer.slave_id.value, json.dumps(dict(epoch=1, position=str(i)))) # Wait for the election to complete. deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The slave with the highest position is elected. assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths assert len(self._launcher._cluster.running_tasks) == 2 # When a new offer comes in, a new task is launched. 
del self._offer.resources[:] resources = create_resources(cpus=1, mem=512, ports=set([10000])) self._offer.resources.extend(resources) task_id, _ = self._launcher.launch(self._offer) assert task_id == "mysos-cluster0-3" launched = self._driver.method_calls["launchTasks"] # One task is relaunched to make up for the failed one. assert len(launched) == self._cluster.num_nodes + 1 def test_launcher_recovery_after_election_completed(self): # 1. Launch a cluster on the running launcher. for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 # 2. Recover the launcher. self._cluster = self._state_provider.load_cluster_state(self._cluster.name) self._launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key, query_interval=Amount(150, Time.MILLISECONDS)) # Now fail the master task. status.task_id.value = "mysos-cluster0-0" status.state = mesos_pb2.TASK_FAILED self._launcher.status_update(status) for i in range(1, self._cluster.num_nodes): self._launcher.framework_message( "mysos-cluster0-%s" % i, self._offer.slave_id.value, json.dumps(dict(epoch=1, position=str(i)))) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The second slave has the larger position and is elected. assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths def test_launcher_recovery_before_election_completed(self): # 1. Launch a cluster on the running launcher. for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. 
status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 # Now fail the master task which leads to re-election. status.task_id.value = "mysos-cluster0-0" status.state = mesos_pb2.TASK_FAILED self._launcher.status_update(status) # 2. Recover the launcher. self._cluster = self._state_provider.load_cluster_state(self._cluster.name) self._launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key, query_interval=Amount(150, Time.MILLISECONDS)) for i in range(1, self._cluster.num_nodes): self._launcher.framework_message( "mysos-cluster0-%s" % i, self._offer.slave_id.value, json.dumps(dict(epoch=2, position=str(i)))) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The second slave has the larger position and is elected. assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths def test_launcher_kill(self): for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING # Valid state. status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 # Kill the cluster. with pytest.raises(MySQLClusterLauncher.PermissionError): self._launcher.kill("wrong_password") # Correct password. self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password)) # All 3 nodes are successfully killed. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_KILLED status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) assert "/mysos/test/cluster0" not in self._storage.paths # ServerSets removed. assert not self._state_provider.load_cluster_state("cluster0") # State removed. def test_launcher_recovery_corrupted_password(self): # 1. 
Launch a single instance for a cluster on the running launcher. task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-0" # The task has successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value status.task_id.value = "mysos-cluster0-0" self._launcher.status_update(status) # 2. Recover the launcher. self._cluster = self._state_provider.load_cluster_state(self._cluster.name) self._cluster.encrypted_password = "******" # The corrupted password causes the launcher constructor to fail. with pytest.raises(ValueError): self._launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key, query_interval=Amount(150, Time.MILLISECONDS))
class TestCluster(unittest.TestCase):
  def setUp(self):
    self.storage = FakeStorage(SequentialThreadingHandler())
    self.client = FakeClient(storage=self.storage)
    self.client.start()

  def tearDown(self):
    self.client.stop()

  def test_add_member(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    assert member1 == manager.add_member(instance1)  # Second insertion is ignored.

    instance2 = ServiceInstance(Endpoint("host2", 10000))
    manager.add_member(instance2)

    assert len(manager._cluster.members) == 2

    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] ==
            ServiceInstance.pack(instance1))
    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
            ServiceInstance.pack(instance2))

  def test_promote_member(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance = ServiceInstance(Endpoint("host", 10000))
    member = manager.add_member(instance)

    assert manager.promote_member(member)
    assert not manager.promote_member(member)  # The 2nd promotion is a no-op.

    assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] ==
            ServiceInstance.pack(instance))

  def test_remove_member(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance = ServiceInstance(Endpoint("host", 10000))
    member = manager.add_member(instance)

    assert manager.remove_member(member)
    assert not manager.remove_member(member)  # The second deletion is ignored.

    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

  def test_callbacks(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    # Set up 2 listeners.
    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    listener1 = ClusterListener(
        self.client,
        "/home/my_cluster",
        instance1,
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback,
        handler1.termination_callback)
    listener1.start()
    member1 = manager.add_member(instance1)

    instance2 = ServiceInstance(Endpoint("host2", 10000))
    handler2 = CallbackHandler()
    listener2 = ClusterListener(
        self.client,
        "/home/my_cluster",
        instance2,
        handler2.promotion_callback,
        handler2.demotion_callback,
        handler2.master_callback)
    listener2.start()
    member2 = manager.add_member(instance2)

    # Test promotion.
    manager.promote_member(member1)

    assert handler1.promoted.wait(1)
    assert handler2.detected.get(True, 1) == instance1

    assert (self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] ==
            ServiceInstance.pack(instance1))
    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
            ServiceInstance.pack(instance2))

    manager.promote_member(member2)

    assert handler1.demoted.wait(1)
    assert handler2.promoted.wait(1)

    assert (self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"] ==
            ServiceInstance.pack(instance2))
    assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths

    manager.remove_member(member2)
    assert handler2.demoted.wait(1)

    # Test removing cluster.
    manager.remove_member(member1)
    manager.delete_cluster()
    assert handler1.terminated.wait(1)

  def test_invalid_arguments(self):
    client = FakeClient()
    client.start()
    manager = ClusterManager(client, "/home/my_cluster")

    with pytest.raises(ValueError) as e:
      manager.promote_member("123")
    assert e.value.message == 'Invalid member_id: 123'

  def test_invalid_znode(self):
    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    listener1 = ClusterListener(
        self.client,
        "/home/my_cluster",
        instance1,
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback)
    listener1.start()

    self.client.ensure_path("/home/my_cluster/master")
    self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

    # Invalid ZNode data translates into a 'None' return.
    assert handler1.detected.get(True, 1) is None

  def test_existing_zk(self):
    """
      ClusterManager needs to be able to recover from an existing ZK group for scheduler failover.
    """
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)

    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] ==
            ServiceInstance.pack(instance1))
    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
            ServiceInstance.pack(instance2))

    manager.promote_member(member1)

    # Test the new ClusterManager.
    manager2 = ClusterManager(self.client, "/home/my_cluster")
    assert len(manager2._cluster.members) == 2
    assert member1 in manager2._cluster.members
    assert member2 in manager2._cluster.members
    assert manager2._cluster.members[member1] == ServiceInstance.pack(instance1)

  def test_remove_cluster(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)

    manager.promote_member(member1)

    with pytest.raises(ClusterManager.Error):
      manager.delete_cluster()

    manager.remove_member(member1)
    manager.remove_member(member2)
    manager.delete_cluster()

    assert "/home/my_cluster" not in self.storage.paths
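# The tests above rely on a CallbackHandler test helper that records ClusterListener callbacks.
# Its implementation is not shown here; a plausible minimal version (an assumption, inferred
# from how 'promoted', 'demoted', 'detected' and 'terminated' are used above) would be:
import threading
from Queue import Queue  # Python 2, matching the rest of these tests.

class CallbackHandlerSketch(object):
  def __init__(self):
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    self.terminated = threading.Event()
    self.detected = Queue()

  def promotion_callback(self):
    self.promoted.set()

  def demotion_callback(self):
    self.demoted.set()

  def master_callback(self, master_instance):
    self.detected.put(master_instance)

  def termination_callback(self):
    self.terminated.set()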
class TestZooKeeperStateProvider(unittest.TestCase):
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()
    self._state_provider = ZooKeeperStateProvider(self._client, '/mysos')

  def tearDown(self):
    self._client.stop()

  def test_scheduler_state(self):
    expected = Scheduler(FrameworkInfo(
        user='******',
        name='test_fw_name',
        checkpoint=True))
    expected.tasks = dict(taks1='cluster1', task2='cluster2')

    self._state_provider.dump_scheduler_state(expected)
    actual = self._state_provider.load_scheduler_state()

    assert expected.framework_info == actual.framework_info
    assert expected.tasks == actual.tasks

  def test_scheduler_state_errors(self):
    # Not an error for scheduler state to be not found.
    assert not self._state_provider.load_scheduler_state()

    self._client.ensure_path("/mysos/state")
    self._client.create("/mysos/state/scheduler", cPickle.dumps(object()))

    with pytest.raises(StateProvider.Error):
      self._state_provider.load_scheduler_state()

  def test_cluster_state(self):
    expected = MySQLCluster(
        'cluster1',
        'cluster_user',
        'cluster_password',
        3,
        DEFAULT_TASK_CPUS,
        DEFAULT_TASK_MEM,
        DEFAULT_TASK_DISK)
    expected.tasks['task1'] = MySQLTask(
        'cluster1', 'task1', 'slave1', 'host1', 10000)

    self._state_provider.dump_cluster_state(expected)
    actual = self._state_provider.load_cluster_state('cluster1')

    assert expected.user == actual.user
    assert isinstance(actual.num_nodes, int)
    assert expected.num_nodes == actual.num_nodes
    assert len(expected.tasks) == len(actual.tasks)
    assert expected.tasks['task1'].port == actual.tasks['task1'].port

  def test_cluster_state_errors(self):
    assert not self._state_provider.load_cluster_state('nonexistent')

    self._client.ensure_path("/mysos/state/clusters")
    self._client.create("/mysos/state/clusters/cluster1", cPickle.dumps(object()))

    with pytest.raises(StateProvider.Error):
      self._state_provider.load_cluster_state('cluster1')
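# The error-path tests above assume the provider keeps pickled state under fixed znodes
# ('/mysos/state/scheduler', '/mysos/state/clusters/<name>') and converts deserialization
# problems into StateProvider.Error. A simplified sketch of such a load path (not the actual
# ZooKeeperStateProvider code; the real provider presumably also validates the type of the
# unpickled object) might be:
import cPickle
from kazoo.exceptions import NoNodeError

def load_pickled_state(client, path):
  try:
    data, _ = client.get(path)
  except NoNodeError:
    return None  # Absent state is not an error.
  try:
    return cPickle.loads(data)
  except Exception as e:
    raise StateProvider.Error("Failed to load state from %s: %s" % (path, e))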
class TestScheduler(unittest.TestCase): def setUp(self): self._driver = FakeDriver() self._storage = FakeStorage(SequentialThreadingHandler()) self._zk_client = FakeClient(storage=self._storage) self._zk_client.start() self._framework_id = mesos_pb2.FrameworkID() self._framework_id.value = "framework_id_0" self._offer = mesos_pb2.Offer() self._offer.id.value = "offer_id_0" self._offer.framework_id.value = self._framework_id.value self._offer.slave_id.value = "slave_id_0" self._offer.hostname = "localhost" resources = create_resources( cpus=DEFAULT_TASK_CPUS * 3, mem=DEFAULT_TASK_MEM * 3, disk=DEFAULT_TASK_DISK * 3, ports=set([10000, 10001, 10002])) self._offer.resources.extend(resources) self._framework_user = "******" self._zk_url = "zk://host/mysos/test" self._cluster = MySQLCluster( "cluster0", "user", "pass", 3, DEFAULT_TASK_CPUS, DEFAULT_TASK_MEM, DEFAULT_TASK_DISK) self._tmpdir = tempfile.mkdtemp() self._state_provider = LocalStateProvider(self._tmpdir) framework_info = mesos_pb2.FrameworkInfo( user=getpass.getuser(), name="mysos", checkpoint=False) self._state = Scheduler(framework_info) def tearDown(self): shutil.rmtree(self._tmpdir, True) # Clean up after ourselves. def test_scheduler_recovery(self): scheduler_key = gen_encryption_key() scheduler1 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler1.registered(self._driver, self._framework_id, object()) scheduler1.create_cluster("cluster1", "mysql_user", 3) scheduler1.resourceOffers(self._driver, [self._offer]) # One task is launched for one offer. assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1 with pytest.raises(MysosScheduler.ClusterExists): scheduler1.create_cluster("cluster1", "mysql_user", 3) # FrameworkID should have been persisted. self._state = self._state_provider.load_scheduler_state() assert self._state.framework_info.id.value == self._framework_id.value # Simulate restart. scheduler2 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) # Scheduler always receives registered() with the same FrameworkID after failover. scheduler2.registered(self._driver, self._framework_id, object()) assert len(scheduler2._launchers) == 1 assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" # Scheduler has recovered the cluster so it doesn't accept another of the same name. with pytest.raises(MysosScheduler.ClusterExists): scheduler2.create_cluster("cluster1", "mysql_user", 3) def test_scheduler_recovery_failure_before_launch(self): scheduler_key = gen_encryption_key() scheduler1 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler1.registered(self._driver, self._framework_id, object()) _, password = scheduler1.create_cluster("cluster1", "mysql_user", 3) # Simulate restart before the task is successfully launched. scheduler2 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) assert len(scheduler2._launchers) == 0 # No launchers are recovered. 
# Scheduler always receives registered() with the same FrameworkID after failover. scheduler2.registered(self._driver, self._framework_id, object()) assert len(scheduler2._launchers) == 1 assert scheduler2._launchers["cluster1"].cluster_name == "cluster1" password_box = PasswordBox(scheduler_key) assert password_box.match( password, scheduler2._launchers["cluster1"]._cluster.encrypted_password) # Now offer the resources for this task. scheduler2.resourceOffers(self._driver, [self._offer]) # One task is launched for the offer. assert len(scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1 # Scheduler has recovered the cluster so it doesn't accept another of the same name. with pytest.raises(MysosScheduler.ClusterExists): scheduler2.create_cluster("cluster1", "mysql_user", 3) def test_incompatible_resource_role(self): scheduler1 = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", gen_encryption_key(), framework_role='mysos') # Require 'mysos' but the resources are in '*'. RootMetrics().register_observable('scheduler', scheduler1) scheduler1.registered(self._driver, self._framework_id, object()) scheduler1.create_cluster("cluster1", "mysql_user", 3) scheduler1.resourceOffers(self._driver, [self._offer]) assert "declineOffer" in self._driver.method_calls assert len(self._driver.method_calls["declineOffer"]) == 1 # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is # a 'Filters' object. assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds == INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS)) sample = RootMetrics().sample() assert sample['scheduler.offers_incompatible_role'] == 1 def test_scheduler_metrics(self): scheduler_key = gen_encryption_key() scheduler = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) RootMetrics().register_observable('scheduler', scheduler) scheduler.registered(self._driver, self._framework_id, object()) sample = RootMetrics().sample() assert sample['scheduler.framework_registered'] == 1 scheduler.create_cluster( "cluster1", "mysql_user", 3, cluster_password='******') sample = RootMetrics().sample() assert sample['scheduler.cluster_count'] == 1 assert sample['scheduler.total_requested_mem_mb'] == DEFAULT_TASK_MEM.as_(Data.MB) * 3 assert sample['scheduler.total_requested_disk_mb'] == DEFAULT_TASK_DISK.as_(Data.MB) * 3 assert sample['scheduler.total_requested_cpus'] == DEFAULT_TASK_CPUS * 3 scheduler.resourceOffers(self._driver, [self._offer]) sample = RootMetrics().sample() assert sample['scheduler.resource_offers'] == 1 assert sample['scheduler.tasks_launched'] == 1 status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value status.task_id.value = 'mysos-cluster1-0' scheduler.statusUpdate(self._driver, status) status.state = mesos_pb2.TASK_FAILED scheduler.statusUpdate(self._driver, status) sample = RootMetrics().sample() assert sample['scheduler.tasks_failed'] == 1 scheduler.delete_cluster("cluster1", 'test_password') sample = RootMetrics().sample() assert sample['scheduler.cluster_count'] == 0 assert sample['scheduler.total_requested_mem_mb'] == 0 assert sample['scheduler.total_requested_disk_mb'] == 0 assert 
sample['scheduler.total_requested_cpus'] == 0 def test_scheduler_delete_empty_cluster(self): scheduler_key = gen_encryption_key() scheduler = MysosScheduler( self._state, self._state_provider, self._framework_user, "./executor.pex", "cmd.sh", self._zk_client, self._zk_url, Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", scheduler_key) scheduler.registered(self._driver, self._framework_id, object()) _, password = scheduler.create_cluster("cluster1", "mysql_user", 3) assert len(scheduler._launchers) == 1 # Deleting the cluster before any offer comes in for launching any task. scheduler.delete_cluster("cluster1", password) assert len(scheduler._launchers) == 0
class TestTaskRunner(unittest.TestCase): def setUp(self): self._storage = FakeStorage(SequentialThreadingHandler()) self._client = FakeClient(storage=self._storage) self._client.start() self._self_instance = ServiceInstance(Endpoint("host", 10000)) self._state_manager = FakeStateManager() def tearDown(self): self._client.stop() def test_stop(self): task_control = FakeTaskControl() runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) runner.start() assert runner.stop() # Killed by SIGTERM. assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGTERM def test_demote(self): task_control = FakeTaskControl() runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) manager = ClusterManager(self._client, "/home/test/my_cluster") runner.start() self_member = manager.add_member(self._self_instance) # 'self_instance' becomes the master. manager.promote_member(self_member) runner.promoted.wait(1) another_member = manager.add_member(ServiceInstance(Endpoint("another_host", 10000))) # This demotes 'self_instance', which should cause runner to stop. manager.promote_member(another_member) assert deadline(runner.join, Amount(1, Time.SECONDS)) def test_reparent(self): task_control = FakeTaskControl() runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) manager = ClusterManager(self._client, "/home/test/my_cluster") runner.start() # Promote another instance. master = ServiceInstance(Endpoint("another_host", 10000)) another_member = manager.add_member(master) manager.promote_member(another_member) assert runner.master.get(True, 1) == master assert runner.stop() assert deadline(runner.join, Amount(1, Time.SECONDS)) def test_mysqld_error(self): task_control = FakeTaskControl(mysqld="exit 123") runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) runner.start() assert deadline(runner.join, Amount(1, Time.SECONDS)) == 123 def test_start_command_error(self): task_control = FakeTaskControl(start_cmd="exit 1") runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) with pytest.raises(TaskError) as e: runner.start() assert e.value.message.startswith("Failed to start MySQL task") def test_promote_command_error(self): task_control = FakeTaskControl(promote_cmd="exit 1") runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) manager = ClusterManager(self._client, "/home/test/my_cluster") runner.start() self_member = manager.add_member(self._self_instance) # 'self_instance' becomes the master. 
manager.promote_member(self_member) runner.promoted.wait(1) with pytest.raises(TaskError) as e: runner.join() assert e.value.message.startswith("Failed to promote the slave") def test_get_log_position(self): task_control = FakeTaskControl(position=1) runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) runner.start() assert runner.get_log_position() == 1 def test_get_log_position_error(self): task_control = FakeTaskControl(get_log_position_cmd="exit 1") runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) with pytest.raises(TaskError) as e: runner.get_log_position() assert (e.value.message == "Unable to get the slave's log position: " + "Command 'exit 1' returned non-zero exit status 1") def test_stop_interminable(self): cmd = """trap "echo Trapped SIGTERM!" TERM while : do sleep 60 done """ task_control = FakeTaskControl(mysqld=cmd) runner = MysosTaskRunner( self._self_instance, self._client, "/home/test/my_cluster", NoopPackageInstaller(), task_control, self._state_manager) task_control._mysqld = cmd runner.start() assert runner.stop(timeout=1) assert deadline(runner.join, Amount(1, Time.SECONDS)) == -signal.SIGKILL
def queue(self):
  client = FakeClient()
  client.start()
  yield ZKDelayDeadlineQueue(client, "/")
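# Example of a test consuming the 'queue' fixture above (illustrative only; 'make_si' is the
# helper these tests use to build an entry with wait_until/bounce_by timestamps):
def test_put_then_get(self, queue):
  now = time.time()
  si = make_si(wait_until=now, bounce_by=now)
  queue.put(si)
  with queue.get(timeout=1.0) as item:
    assert item == si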
def setUp(self):
  self.storage = FakeStorage(SequentialThreadingHandler())
  self.client = FakeClient(storage=self.storage)
  self.client.start()
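# Passing an explicit FakeStorage to FakeClient (as in setUp above) lets several fake clients
# share one in-memory ZK tree and lets tests inspect raw znodes via 'storage.paths'. A small
# self-contained illustration of that sharing:
from kazoo.handlers.threading import SequentialThreadingHandler
from zake.fake_client import FakeClient
from zake.fake_storage import FakeStorage

storage = FakeStorage(SequentialThreadingHandler())
client_a = FakeClient(storage=storage)
client_b = FakeClient(storage=storage)
client_a.start()
client_b.start()

client_a.ensure_path("/shared/node")
assert client_b.exists("/shared/node")  # Visible to the other client through the shared storage.
assert "/shared/node" in storage.paths  # And inspectable via the raw storage dict.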
class TestCluster(unittest.TestCase): def setUp(self): self.storage = FakeStorage(SequentialThreadingHandler()) self.client = FakeClient(storage=self.storage) self.client.start() def tearDown(self): self.client.stop() def test_add_member(self): manager = ClusterManager(self.client, "/home/my_cluster") instance1 = ServiceInstance(Endpoint("host1", 10000)) member1 = manager.add_member(instance1) assert member1 == manager.add_member(instance1) # Second insertion is ignored. instance2 = ServiceInstance(Endpoint("host2", 10000)) manager.add_member(instance2) assert len(manager._cluster.members) == 2 assert self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] == ServiceInstance.pack( instance1 ) assert self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == ServiceInstance.pack( instance2 ) def test_promote_member(self): manager = ClusterManager(self.client, "/home/my_cluster") instance = ServiceInstance(Endpoint("host", 10000)) member = manager.add_member(instance) assert manager.promote_member(member) assert not manager.promote_member(member) # The 2nd promotion is a no-op. assert self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == ServiceInstance.pack(instance) def test_remove_member(self): manager = ClusterManager(self.client, "/home/my_cluster") instance = ServiceInstance(Endpoint("host", 10000)) member = manager.add_member(instance) assert manager.remove_member(member) assert not manager.remove_member(member) # The second deletion is ignored. assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths def test_callbacks(self): manager = ClusterManager(self.client, "/home/my_cluster") # Set up 2 listeners. instance1 = ServiceInstance(Endpoint("host1", 10000)) handler1 = CallbackHandler() listener1 = ClusterListener( self.client, "/home/my_cluster", instance1, handler1.promotion_callback, handler1.demotion_callback, handler1.master_callback, handler1.termination_callback, ) listener1.start() member1 = manager.add_member(instance1) instance2 = ServiceInstance(Endpoint("host2", 10000)) handler2 = CallbackHandler() listener2 = ClusterListener( self.client, "/home/my_cluster", instance2, handler2.promotion_callback, handler2.demotion_callback, handler2.master_callback, ) listener2.start() member2 = manager.add_member(instance2) # Test promotion. manager.promote_member(member1) assert handler1.promoted.wait(1) assert handler2.detected.get(True, 1) == instance1 assert self.storage.paths["/home/my_cluster/master/member_0000000000"]["data"] == ServiceInstance.pack( instance1 ) assert self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] == ServiceInstance.pack( instance2 ) manager.promote_member(member2) assert handler1.demoted.wait(1) assert handler2.promoted.wait(1) assert self.storage.paths["/home/my_cluster/master/member_0000000001"]["data"] == ServiceInstance.pack( instance2 ) assert "/home/my_cluster/master/member_0000000000" not in self.storage.paths manager.remove_member(member2) assert handler2.demoted.wait(1) # Test removing cluster. 
    manager.remove_member(member1)
    manager.delete_cluster()

    assert handler1.terminated.wait(1)

  def test_invalid_arguments(self):
    client = FakeClient()
    client.start()

    manager = ClusterManager(client, "/home/my_cluster")
    with pytest.raises(ValueError) as e:
      manager.promote_member("123")
    assert e.value.message == "Invalid member_id: 123"

  def test_invalid_znode(self):
    instance1 = ServiceInstance(Endpoint("host1", 10000))
    handler1 = CallbackHandler()
    listener1 = ClusterListener(
        self.client,
        "/home/my_cluster",
        instance1,
        handler1.promotion_callback,
        handler1.demotion_callback,
        handler1.master_callback)
    listener1.start()

    self.client.ensure_path("/home/my_cluster/master")
    self.client.create("/home/my_cluster/master/member_", "Invalid Data", sequence=True)

    # Invalid ZNode data translates into a 'None' return.
    assert handler1.detected.get(True, 1) is None

  def test_existing_zk(self):
    """ClusterManager needs to be able to recover from an existing ZK group for scheduler failover."""
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)

    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000000"]["data"] ==
            ServiceInstance.pack(instance1))
    assert (self.storage.paths["/home/my_cluster/slaves/member_0000000001"]["data"] ==
            ServiceInstance.pack(instance2))

    manager.promote_member(member1)

    # Test the new ClusterManager.
    manager2 = ClusterManager(self.client, "/home/my_cluster")

    assert len(manager2._cluster.members) == 2
    assert member1 in manager2._cluster.members
    assert member2 in manager2._cluster.members

    assert manager2._cluster.members[member1] == ServiceInstance.pack(instance1)

  def test_remove_cluster(self):
    manager = ClusterManager(self.client, "/home/my_cluster")

    instance1 = ServiceInstance(Endpoint("host1", 10000))
    member1 = manager.add_member(instance1)
    instance2 = ServiceInstance(Endpoint("host2", 10000))
    member2 = manager.add_member(instance2)

    manager.promote_member(member1)

    with pytest.raises(ClusterManager.Error):
      manager.delete_cluster()

    manager.remove_member(member1)
    manager.remove_member(member2)
    manager.delete_cluster()

    assert "/home/my_cluster" not in self.storage.paths
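# The callback tests above use a CallbackHandler helper that is not defined in this snippet.
# A plausible minimal sketch (the attribute names and behavior are assumptions inferred from
# how the handler is used above, not the project's actual implementation):
import threading

try:
  from Queue import Queue  # Python 2, matching the e.value.message style used above.
except ImportError:
  from queue import Queue  # Python 3 fallback.


class CallbackHandler(object):
  """Records ClusterListener callbacks so tests can block on them."""

  def __init__(self):
    self.promoted = threading.Event()
    self.demoted = threading.Event()
    self.detected = Queue()  # Receives the detected master's ServiceInstance (or None).
    self.terminated = threading.Event()

  def promotion_callback(self):
    self.promoted.set()

  def demotion_callback(self):
    self.demoted.set()

  def master_callback(self, master):
    self.detected.put(master)

  def termination_callback(self):
    self.terminated.set()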
    def multiple_queues(self):
        client = FakeClient()
        client.start()
        yield [ZKDelayDeadlineQueue(client, "/") for _ in range(5)]
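# The queue/multiple_queues generators above read like pytest fixtures but are shown without
# their decorators. A hedged sketch of how such a fixture is typically declared and torn down
# (the decorator and the client.stop() cleanup are assumptions, not part of the original):
import pytest


@pytest.fixture
def queue():
    client = FakeClient()
    client.start()
    yield ZKDelayDeadlineQueue(client, "/")
    client.stop()  # Tear down the in-memory fake ZooKeeper client after the test.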
class TestScheduler(unittest.TestCase):
  def setUp(self):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._framework_id = mesos_pb2.FrameworkID()
    self._framework_id.value = "framework_id_0"

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = self._framework_id.value
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    resources = create_resources(cpus=4, mem=512 * 3, ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    self._zk_url = "zk://host/mysos/test"
    self._cluster = MySQLCluster("cluster0", "user", "pass", 3)

    self._tmpdir = tempfile.mkdtemp()
    self._state_provider = LocalStateProvider(self._tmpdir)

    framework_info = mesos_pb2.FrameworkInfo(
        user=getpass.getuser(),
        name="mysos",
        checkpoint=False)
    self._state = Scheduler(framework_info)

  def tearDown(self):
    shutil.rmtree(self._tmpdir, True)  # Clean up after ourselves.

  def test_scheduler_recovery(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    # One task is launched for one offer.
    assert len(scheduler1._launchers["cluster1"]._cluster.tasks) == 1

    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # FrameworkID should have been persisted.
    self._state = self._state_provider.load_scheduler_state()
    assert self._state.framework_info.id.value == self._framework_id.value

    # Simulate restart.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_scheduler_recovery_failure_before_launch(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)

    # Simulate restart before the task is successfully launched.
    scheduler2 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml")

    assert len(scheduler2._launchers) == 0  # No launchers are recovered.

    # Scheduler always receives registered() with the same FrameworkID after failover.
    scheduler2.registered(self._driver, self._framework_id, object())

    assert len(scheduler2._launchers) == 1
    assert scheduler2._launchers["cluster1"].cluster_name == "cluster1"

    # Now offer the resources for this task.
    scheduler2.resourceOffers(self._driver, [self._offer])

    # One task is launched for the offer.
    assert len(scheduler2._launchers["cluster1"]._cluster.active_tasks) == 1

    # Scheduler has recovered the cluster so it doesn't accept another of the same name.
    with pytest.raises(MysosScheduler.ClusterExists):
      scheduler2.create_cluster("cluster1", "mysql_user", 3)

  def test_incompatible_resource_role(self):
    scheduler1 = MysosScheduler(
        self._state,
        self._state_provider,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        self._zk_client,
        self._zk_url,
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        framework_role='mysos')  # Require 'mysos' but the resources are in '*'.

    scheduler1.registered(self._driver, self._framework_id, object())
    scheduler1.create_cluster("cluster1", "mysql_user", 3)
    scheduler1.resourceOffers(self._driver, [self._offer])

    assert "declineOffer" in self._driver.method_calls
    assert len(self._driver.method_calls["declineOffer"]) == 1

    # [0][0][1]: [First declineOffer call][The positional args][The first positional arg], which is
    # a 'Filters' object.
    assert (self._driver.method_calls["declineOffer"][0][0][1].refuse_seconds ==
            INCOMPATIBLE_ROLE_OFFER_REFUSE_DURATION.as_(Time.SECONDS))
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()

    self._self_instance = ServiceInstance(Endpoint("host", 10000))
    self._state_manager = FakeStateManager()
def test_list_deploy_queue(mock_delay_deadline_queue_class, mock_kazoo_client):
    mock_request = mock.Mock()
    settings.system_paasta_config = mock.create_autospec(SystemPaastaConfig)
    mock_kazoo_client.return_value = FakeClient()

    available_service_instance = ServiceInstance(
        service="fake_service1",
        instance="fake_instance1",
        watcher="worker0",
        bounce_by=1577952000,
        wait_until=1577952000,
        enqueue_time=1577952000,
        bounce_start_time=1577952000,
        failures=1,
        processed_count=2,
    )
    unavailable_service_instance = ServiceInstance(
        service="fake_service2",
        instance="fake_instance2",
        watcher="worker1",
        bounce_by=1577952100,
        wait_until=1577952200,
        enqueue_time=1577952100,
        bounce_start_time=1577952100,
        failures=2,
        processed_count=3,
    )

    mock_delay_deadline_queue = mock_delay_deadline_queue_class.return_value
    mock_delay_deadline_queue.get_available_service_instances.return_value = [
        (mock.Mock(), available_service_instance)
    ]
    mock_delay_deadline_queue.get_unavailable_service_instances.return_value = [
        (mock.Mock(), mock.Mock(), unavailable_service_instance)
    ]

    output = deploy_queue.list_deploy_queue(mock_request)
    assert output == {
        "available_service_instances": [
            {
                "service": "fake_service1",
                "instance": "fake_instance1",
                "watcher": "worker0",
                "bounce_by": 1577952000,
                "wait_until": 1577952000,
                "enqueue_time": 1577952000,
                "bounce_start_time": 1577952000,
                "failures": 1,
                "processed_count": 2,
            }
        ],
        "unavailable_service_instances": [
            {
                "service": "fake_service2",
                "instance": "fake_instance2",
                "watcher": "worker1",
                "bounce_by": 1577952100,
                "wait_until": 1577952200,
                "enqueue_time": 1577952100,
                "bounce_start_time": 1577952100,
                "failures": 2,
                "processed_count": 3,
            }
        ],
    }
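# test_list_deploy_queue receives mock_delay_deadline_queue_class and mock_kazoo_client as
# arguments, which suggests it is wrapped in mock.patch decorators. A hedged sketch of that
# wiring (the patch targets below are illustrative assumptions, not the project's actual paths):
import mock  # or: from unittest import mock


@mock.patch("paasta_tools.api.views.deploy_queue.KazooClient", autospec=True)
@mock.patch("paasta_tools.api.views.deploy_queue.ZKDelayDeadlineQueue", autospec=True)
def test_list_deploy_queue_example(mock_delay_deadline_queue_class, mock_kazoo_client):
    # Decorators apply bottom-up, so the ZKDelayDeadlineQueue patch arrives as the first argument.
    mock_kazoo_client.return_value = FakeClient()
    # ...the body shown above would follow here.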
class TestLauncher(object):
  @pytest.fixture(params=[LocalStateProvider, ZooKeeperStateProvider], autouse=True)
  def setup(self, request):
    self._driver = FakeDriver()
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._zk_client = FakeClient(storage=self._storage)
    self._zk_client.start()

    self._offer = mesos_pb2.Offer()
    self._offer.id.value = "offer_id_0"
    self._offer.framework_id.value = "framework_id_0"
    self._offer.slave_id.value = "slave_id_0"
    self._offer.hostname = "localhost"

    # Enough resources to fit three tasks.
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3,
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    self._framework_user = "******"

    # Some tests use the default launcher; some don't.
    self._zk_url = "zk://host/mysos/test"
    self._scheduler_key = gen_encryption_key()
    self._password_box = PasswordBox(self._scheduler_key)
    self._cluster = MySQLCluster(
        "cluster0",
        "user",
        self._password_box.encrypt("pass"),
        3,
        DEFAULT_TASK_CPUS,
        DEFAULT_TASK_MEM,
        DEFAULT_TASK_DISK)

    # Construct the state provider based on the test parameter.
    if request.param == LocalStateProvider:
      tmpdir = tempfile.mkdtemp()
      self._state_provider = LocalStateProvider(tmpdir)
      request.addfinalizer(lambda: shutil.rmtree(tmpdir, True))  # Clean up after ourselves.
    elif request.param == ZooKeeperStateProvider:
      self._state_provider = ZooKeeperStateProvider(self._zk_client, "/mysos/test")

    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))  # Short interval.

    self._elected = threading.Event()
    self._launchers = [self._launcher]  # See teardown().

    request.addfinalizer(self.teardown)

  def teardown(self):
    for launcher in self._launchers:
      if launcher._elector:
        launcher._elector.abort()  # Abort the thread even if the election is pending.
        launcher._elector.join()

  def test_launch_cluster_all_nodes_successful(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys()
                if x.startswith("/mysos/test/cluster0/slaves/member_")]) == 2

  def test_launch_cluster_insufficient_resources(self):
    """All but one of the cluster's nodes are launched successfully."""
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3 - Amount(1, Data.MB),  # 1mb less than required disk space.
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)

    # The offer is short on disk, so the entire cluster cannot be launched.
    for i in range(self._cluster.num_nodes - 1):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes - 1

    # The final task cannot get launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes - 1

    # The two nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes - 1):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # One slave.
    assert len([x for x in self._storage.paths.keys()
                if x.startswith("/mysos/test/cluster0/slaves/member_")]) == 1

  def test_two_launchers(self):
    """Two launchers share resources and launch their clusters successfully."""
    launchers = [
        MySQLClusterLauncher(
            self._driver,
            MySQLCluster(
                "cluster0",
                "user0",
                self._password_box.encrypt("pass0"),
                1,
                DEFAULT_TASK_CPUS,
                DEFAULT_TASK_MEM,
                DEFAULT_TASK_DISK),
            self._state_provider,
            self._zk_url,
            self._zk_client,
            self._framework_user,
            "./executor.pex",
            "cmd.sh",
            Amount(5, Time.SECONDS),
            "/etc/mysos/admin_keyfile.yml",
            self._scheduler_key),
        MySQLClusterLauncher(
            self._driver,
            MySQLCluster(
                "cluster1",
                "user1",
                self._password_box.encrypt("pass1"),
                2,
                DEFAULT_TASK_CPUS,
                DEFAULT_TASK_MEM,
                DEFAULT_TASK_DISK),
            self._state_provider,
            self._zk_url,
            self._zk_client,
            self._framework_user,
            "./executor.pex",
            "cmd.sh",
            Amount(5, Time.SECONDS),
            "/etc/mysos/admin_keyfile.yml",
            self._scheduler_key)]
    self._launchers.extend(launchers)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS * 3,
        mem=DEFAULT_TASK_MEM * 3,
        disk=DEFAULT_TASK_DISK * 3,
        ports=set([10000, 10001, 10002]))
    self._offer.resources.extend(resources)  # Three nodes in total across two clusters.

    # Simulate the scheduler.
    for i in range(3):
      for launcher in launchers:
        task_id, remaining = launcher.launch(self._offer)
        if task_id:
          # Update the offer so other launchers will use its remaining resources.
          del self._offer.resources[:]
          self._offer.resources.extend(remaining)
          break

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == 3

  def test_invalid_status_update(self):
    """Launcher raises an exception when an invalid status is received."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING  # Valid state.
    launcher.status_update(status)

    status.state = mesos_pb2.TASK_FINISHED  # An invalid state.
    with pytest.raises(MySQLClusterLauncher.Error):
      launcher.status_update(status)

  def test_terminal_status_update(self):
    """Launcher reacts to terminated task by launching a new one."""
    self._cluster.num_nodes = 1
    launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(1, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key)
    self._launchers.append(launcher)

    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-0"

    launched = self._driver.method_calls["launchTasks"]
    assert len(launched) == self._cluster.num_nodes

    status = mesos_pb2.TaskStatus()
    status.task_id.value = task_id
    status.state = mesos_pb2.TASK_RUNNING
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 1

    status.state = mesos_pb2.TASK_LOST
    launcher.status_update(status)

    assert len(launcher._cluster.running_tasks) == 0

    task_id, _ = launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-1"

    launched = self._driver.method_calls["launchTasks"]

    # One task is relaunched to make up for the lost one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_master_failover(self):
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    # No log positions queries are sent for the first epoch.
    assert "sendFrameworkMessage" not in self._driver.method_calls

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    assert len(self._launcher._cluster.running_tasks) == 2

    # Log positions queries are sent.
    self._launcher._elector._elect()
    assert len(self._driver.method_calls["sendFrameworkMessage"]) >= 2

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    # Wait for the election to complete.
    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The slave with the highest position is elected.
    assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths

    assert len(self._launcher._cluster.running_tasks) == 2

    # When a new offer comes in, a new task is launched.
    del self._offer.resources[:]
    resources = create_resources(
        cpus=DEFAULT_TASK_CPUS,
        mem=DEFAULT_TASK_MEM,
        disk=DEFAULT_TASK_DISK,
        ports=set([10000]))
    self._offer.resources.extend(resources)
    task_id, _ = self._launcher.launch(self._offer)
    assert task_id == "mysos-cluster0-3"

    launched = self._driver.method_calls["launchTasks"]

    # One task is relaunched to make up for the failed one.
    assert len(launched) == self._cluster.num_nodes + 1

  def test_launcher_recovery_after_election_completed(self):
    # 1. Launch a cluster on the running launcher.
    for i in range(self._cluster.num_nodes):
      task_id, remaining = self._launcher.launch(self._offer)
      del self._offer.resources[:]
      self._offer.resources.extend(remaining)
      assert task_id == "mysos-cluster0-%s" % i

    tasks = self._driver.method_calls["launchTasks"]
    assert len(tasks) == self._cluster.num_nodes

    # No new tasks are launched.
    assert self._launcher.launch(self._offer)[0] is None
    assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes

    # All 3 nodes have successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The first slave is elected.
    assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths
    # Two slaves.
    assert len([x for x in self._storage.paths.keys()
                if x.startswith("/mysos/test/cluster0/slaves/member_")]) == 2

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._launcher = MySQLClusterLauncher(
        self._driver,
        self._cluster,
        self._state_provider,
        self._zk_url,
        self._zk_client,
        self._framework_user,
        "./executor.pex",
        "cmd.sh",
        Amount(5, Time.SECONDS),
        "/etc/mysos/admin_keyfile.yml",
        self._scheduler_key,
        query_interval=Amount(150, Time.MILLISECONDS))

    # Now fail the master task.
    status.task_id.value = "mysos-cluster0-0"
    status.state = mesos_pb2.TASK_FAILED
    self._launcher.status_update(status)

    for i in range(1, self._cluster.num_nodes):
      self._launcher.framework_message(
          "mysos-cluster0-%s" % i,
          self._offer.slave_id.value,
          json.dumps(dict(epoch=1, position=str(i))))

    deadline(
        lambda: wait_for_master(
            get_cluster_path(self._zk_url, self._cluster.name),
            self._zk_client),
        Amount(5, Time.SECONDS))

    # The second slave has the larger position and is elected.
assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths def test_launcher_recovery_before_election_completed(self): # 1. Launch a cluster on the running launcher. for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 # Now fail the master task which leads to re-election. status.task_id.value = "mysos-cluster0-0" status.state = mesos_pb2.TASK_FAILED self._launcher.status_update(status) # 2. Recover the launcher. self._cluster = self._state_provider.load_cluster_state(self._cluster.name) self._launcher = MySQLClusterLauncher( self._driver, self._cluster, self._state_provider, self._zk_url, self._zk_client, self._framework_user, "./executor.pex", "cmd.sh", Amount(5, Time.SECONDS), "/etc/mysos/admin_keyfile.yml", self._scheduler_key, query_interval=Amount(150, Time.MILLISECONDS)) for i in range(1, self._cluster.num_nodes): self._launcher.framework_message( "mysos-cluster0-%s" % i, self._offer.slave_id.value, json.dumps(dict(epoch=2, position=str(i)))) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The second slave has the larger position and is elected. assert "/mysos/test/cluster0/master/member_0000000002" in self._storage.paths def test_launcher_kill(self): for i in range(self._cluster.num_nodes): task_id, remaining = self._launcher.launch(self._offer) del self._offer.resources[:] self._offer.resources.extend(remaining) assert task_id == "mysos-cluster0-%s" % i tasks = self._driver.method_calls["launchTasks"] assert len(tasks) == self._cluster.num_nodes # No new tasks are launched. assert self._launcher.launch(self._offer)[0] is None assert len(self._driver.method_calls["launchTasks"]) == self._cluster.num_nodes # All 3 nodes have successfully started. status = mesos_pb2.TaskStatus() status.state = mesos_pb2.TASK_RUNNING # Valid state. status.slave_id.value = self._offer.slave_id.value for i in range(self._cluster.num_nodes): status.task_id.value = "mysos-cluster0-%s" % i self._launcher.status_update(status) deadline( lambda: wait_for_master( get_cluster_path(self._zk_url, self._cluster.name), self._zk_client), Amount(5, Time.SECONDS)) # The first slave is elected. assert "/mysos/test/cluster0/master/member_0000000000" in self._storage.paths # Two slaves. assert len([x for x in self._storage.paths.keys() if x.startswith( "/mysos/test/cluster0/slaves/member_")]) == 2 # Kill the cluster. 
    with pytest.raises(MySQLClusterLauncher.PermissionError):
      self._launcher.kill("wrong_password")

    # Correct password.
    self._launcher.kill(self._password_box.decrypt(self._cluster.encrypted_password))

    # All 3 nodes are successfully killed.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_KILLED
    status.slave_id.value = self._offer.slave_id.value
    for i in range(self._cluster.num_nodes):
      status.task_id.value = "mysos-cluster0-%s" % i
      self._launcher.status_update(status)

    assert "/mysos/test/cluster0" not in self._storage.paths  # ServerSets removed.
    assert not self._state_provider.load_cluster_state("cluster0")  # State removed.

  def test_launcher_recovery_corrupted_password(self):
    # 1. Launch a single instance for a cluster on the running launcher.
    task_id, remaining = self._launcher.launch(self._offer)
    del self._offer.resources[:]
    self._offer.resources.extend(remaining)
    assert task_id == "mysos-cluster0-0"

    # The task has successfully started.
    status = mesos_pb2.TaskStatus()
    status.state = mesos_pb2.TASK_RUNNING
    status.slave_id.value = self._offer.slave_id.value
    status.task_id.value = "mysos-cluster0-0"
    self._launcher.status_update(status)

    # 2. Recover the launcher.
    self._cluster = self._state_provider.load_cluster_state(self._cluster.name)
    self._cluster.encrypted_password = "******"

    # The corrupted password causes the launcher constructor to fail.
    with pytest.raises(ValueError):
      self._launcher = MySQLClusterLauncher(
          self._driver,
          self._cluster,
          self._state_provider,
          self._zk_url,
          self._zk_client,
          self._framework_user,
          "./executor.pex",
          "cmd.sh",
          Amount(5, Time.SECONDS),
          "/etc/mysos/admin_keyfile.yml",
          self._scheduler_key,
          query_interval=Amount(150, Time.MILLISECONDS))
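# The kill test above exercises PasswordBox in both directions. A small round-trip sketch,
# assuming only the encrypt/decrypt/gen_encryption_key API already used in these tests:
key = gen_encryption_key()
box = PasswordBox(key)
token = box.encrypt("pass")
assert box.decrypt(token) == "pass"  # Decryption recovers the original password.
assert token != "pass"               # The stored value is not the plaintext.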
  def setUp(self):
    self._storage = FakeStorage(SequentialThreadingHandler())
    self._client = FakeClient(storage=self._storage)
    self._client.start()

    self._state_provider = ZooKeeperStateProvider(self._client, '/mysos')
def panoptes_mock_kazoo_client(**kwargs): return FakeClient()
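# panoptes_mock_kazoo_client is a factory that stands in for kazoo.client.KazooClient. A hedged
# usage sketch (the patch target is an assumption; only mock.patch and zake's FakeClient are
# relied upon here):
import mock  # or: from unittest import mock

with mock.patch("kazoo.client.KazooClient", panoptes_mock_kazoo_client):
    # Code under test that constructs KazooClient(...) now gets an in-memory FakeClient instead.
    pass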