def test_builds_with_pagination_request(
        self,
        offset: Optional[int],
        limit: Optional[int],
        expected_first_build_id: int,
        expected_last_build_id: int,
):
    """Verify that get_builds honors the offset/limit pagination parameters."""
    cluster_master = ClusterMaster()

    # Create 20 mock builds with ids 1 to 20.
    for next_id in range(1, self._NUM_BUILDS + 1):
        fake_build = Mock(spec=Build)
        fake_build.build_id = next_id
        BuildStore._all_builds_by_id[next_id] = fake_build

    page = cluster_master.get_builds(offset, limit)

    first_id = page[0].build_id if len(page) else None
    last_id = page[-1].build_id if len(page) else None

    self.assertEqual(first_id, expected_first_build_id,
                     'Received the wrong first build from request')
    self.assertEqual(last_id, expected_last_build_id,
                     'Received the wrong last build from request')
    if offset is not None and limit is not None:
        self.assertLessEqual(len(page), self._PAGINATION_MAX_LIMIT,
                             'Received too many builds from request')
def test_connect_slave_with_existing_dead_slave_creates_new_alive_instance(self):
    """Reconnecting a dead slave url should create a fresh, alive slave with a new id."""
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    master.connect_slave('existing-slave.turtles.gov', 10)
    existing_slave = slave_registry.get_slave(
        slave_id=None, slave_url='existing-slave.turtles.gov')
    existing_slave.set_is_alive(False)
    existing_slave_id = existing_slave.id

    connect_response = master.connect_slave('existing-slave.turtles.gov', 10)
    new_slave = slave_registry.get_slave(slave_url='existing-slave.turtles.gov')

    self.assertNotEqual(
        str(existing_slave_id), connect_response['slave_id'],
        'The re-connected slave should have generated a new slave id.')
    self.assertTrue(
        new_slave.is_alive(use_cached=True),
        'The new slave should have been marked as alive once instantiated.')
    # Fixed: `assertEquals` is a deprecated alias of `assertEqual` (removed in
    # Python 3.12); use the canonical name.
    self.assertEqual(
        2, self.mock_slave_allocator.add_idle_slave.call_count,
        'Expected slave to be added to the idle slaves list.')
def test_get_slave_raises_exception_on_invalid_arguments(self, get_slave_kwargs):
    """get_slave should reject invalid argument combinations with ValueError."""
    cluster_master = ClusterMaster()
    cluster_master.connect_slave('raphael.turtles.gov', 10)

    with self.assertRaises(ValueError):
        cluster_master.get_slave(**get_slave_kwargs)
def test_handle_result_reported_from_slave_when_build_is_canceled(self):
    """A canceled build still processes reported results and frees the executor."""
    build_id = 1
    slave_url = "url"
    canceled_build = Build(BuildRequest({}))
    self.patch('app.master.build.util')
    canceled_build.generate_project_type()
    canceled_build.cancel()
    self.patch_object(canceled_build, '_handle_subjob_payload')
    self.patch_object(canceled_build, '_mark_subjob_complete')

    cluster_master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    BuildStore._all_builds_by_id[build_id] = canceled_build
    slave_registry._all_slaves_by_url[slave_url] = Mock()
    fake_scheduler = self.mock_scheduler_pool.get(canceled_build)

    cluster_master.handle_result_reported_from_slave(slave_url, build_id, 1)

    self.assertEqual(canceled_build._handle_subjob_payload.call_count, 1,
                     "Canceled builds should handle payload")
    self.assertEqual(canceled_build._mark_subjob_complete.call_count, 1,
                     "Canceled builds should mark their subjobs complete")
    self.assertTrue(fake_scheduler.execute_next_subjob_or_free_executor.called)
def test_builds_with_pagination_request(
        self,
        offset: Optional[int],
        limit: Optional[int],
        expected_first_build_id: int,
        expected_last_build_id: int,
):
    """Verify get_builds pagination against builds placed directly in the cache."""
    # NOTE(review): another test with this same name appears in this file; if both
    # end up in one class, Python keeps only the later definition — confirm intent.
    cluster_master = ClusterMaster()

    # Create 20 mock builds with ids 1 to 20.
    for next_id in range(1, self._NUM_BUILDS + 1):
        cached_build = Mock(spec=Build)
        cached_build.build_id = next_id
        BuildStore._cached_builds_by_id[next_id] = cached_build

    # Normally `get_builds` counts the amount of builds in database, but since we're
    # directly adding builds into the cache here, we want to count those instead.
    patched_count = self.patch(
        'app.database.build_store.BuildStore.count_all_builds', autospec=False)
    patched_count.return_value = len(BuildStore._cached_builds_by_id)

    page = cluster_master.get_builds(offset, limit)

    first_id = page[0].build_id if len(page) else None
    last_id = page[-1].build_id if len(page) else None

    self.assertEqual(first_id, expected_first_build_id,
                     'Received the wrong first build from request')
    self.assertEqual(last_id, expected_last_build_id,
                     'Received the wrong last build from request')
    if offset is not None and limit is not None:
        self.assertLessEqual(len(page), self._PAGINATION_MAX_LIMIT,
                             'Received too many builds from request')
def test_updating_slave_to_nonexistent_state_should_raise_bad_request_error(self):
    """An unrecognized slave state should be rejected with BadRequestError."""
    cluster_master = ClusterMaster()
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, 10)
    connected_slave = cluster_master.get_slave(slave_url=turtle_url)

    with self.assertRaises(BadRequestError):
        cluster_master.handle_slave_state_update(connected_slave, 'NONEXISTENT_STATE')
def test_get_slave_raises_exception_on_slave_not_found(self, get_slave_kwargs):
    """Looking up a slave that was never connected should raise ItemNotFoundError."""
    cluster_master = ClusterMaster()
    for turtle_url in ('raphael.turtles.gov', 'leonardo.turtles.gov', 'donatello.turtles.gov'):
        cluster_master.connect_slave(turtle_url, 10)

    with self.assertRaises(ItemNotFoundError):
        cluster_master.get_slave(**get_slave_kwargs)
def test_connect_slave_adds_new_slave_if_slave_never_connected_before(self):
    """Connecting a brand-new slave url should register exactly one slave."""
    cluster_master = ClusterMaster()
    cluster_master.connect_slave('never-before-seen.turtles.gov', 10)

    registered_slaves = cluster_master.all_slaves_by_id()
    self.assertEqual(1, len(registered_slaves),
                     'Exactly one slave should be registered with the master.')
    found_slave = cluster_master.get_slave(
        slave_id=None, slave_url='never-before-seen.turtles.gov')
    self.assertIsNotNone(found_slave,
                         'Registered slave does not have the expected url.')
def test_update_slave_last_heartbeat_time_calls_correspondig_slave_method(self, slave_alive, method_call_count):
    """Heartbeat updates should be forwarded to the slave the expected number of times."""
    cluster_master = ClusterMaster()
    fake_slave = self.patch('app.master.cluster_master.Slave').return_value
    fake_slave.is_alive.return_value = slave_alive

    cluster_master.update_slave_last_heartbeat_time(fake_slave)

    self.assertEqual(fake_slave.update_last_heartbeat_time.call_count, method_call_count,
                     'last heartbeat time is updated for the target slave')
def test_updating_slave_to_disconnected_state_should_reset_slave_current_build_id(self):
    """Marking a slave DISCONNECTED should clear its current build id."""
    cluster_master = ClusterMaster()
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, num_executors=10)
    connected_slave = cluster_master.get_slave(slave_url=turtle_url)
    connected_slave.current_build_id = 4

    cluster_master.handle_slave_state_update(connected_slave, SlaveState.DISCONNECTED)

    self.assertIsNone(connected_slave.current_build_id)
def test_updating_slave_to_shutdown_should_call_slave_set_shutdown_mode(self):
    """SHUTDOWN state update should put the slave into shutdown mode."""
    cluster_master = ClusterMaster()
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, 10)
    connected_slave = cluster_master.get_slave(slave_url=turtle_url)
    connected_slave.set_shutdown_mode = Mock()

    cluster_master.handle_slave_state_update(connected_slave, SlaveState.SHUTDOWN)

    connected_slave.set_shutdown_mode.assert_called_once_with()
def test_update_slave_last_heartbeat_time_calls_update_last_heartbeat_time_on_slave(self):
    """A heartbeat update should be forwarded to the target slave exactly once."""
    cluster_master = ClusterMaster()
    fake_slave = self.patch('app.master.cluster_master.Slave').return_value

    cluster_master.update_slave_last_heartbeat_time(fake_slave)

    self.assertEqual(fake_slave.update_last_heartbeat_time.call_count, 1,
                     'last heartbeat time is updated for the target slave')
def test_updating_slave_to_disconnected_state_should_mark_slave_as_dead(self):
    """DISCONNECTED state update should flip the slave from alive to dead."""
    cluster_master = ClusterMaster()
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, num_executors=10)
    connected_slave = cluster_master.get_slave(slave_url=turtle_url)
    self.assertTrue(connected_slave.is_alive())

    cluster_master.handle_slave_state_update(connected_slave, SlaveState.DISCONNECTED)

    self.assertFalse(connected_slave.is_alive())
def test_connect_slave_with_existing_slave_running_build_cancels_build(self):
    """Reconnecting a slave that is mid-build should cancel that build."""
    cluster_master = ClusterMaster()
    cluster_master.connect_slave('running-slave.turtles.gov', 10)
    running_build = MagicMock(spec_set=Build)
    cluster_master._all_builds_by_id[1] = running_build
    existing_slave = cluster_master.get_slave(
        slave_id=None, slave_url='running-slave.turtles.gov')
    existing_slave.current_build_id = 1

    cluster_master.connect_slave('running-slave.turtles.gov', 10)

    self.assertTrue(running_build.cancel.called, 'The build was not cancelled.')
def test_connect_slave_with_existing_dead_slave_removes_old_slave_entry_from_registry(self):
    """Reconnecting a slave url should evict the old slave entry from the registry."""
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    master.connect_slave('existing-slave.turtles.gov', 10)
    old_existing_slave = slave_registry.get_slave(
        slave_id=None, slave_url='existing-slave.turtles.gov')
    old_existing_slave_id = old_existing_slave.id

    # Fixed: the connect response was bound to an unused local (`connect_response`);
    # the call's side effect is all this test needs.
    master.connect_slave('existing-slave.turtles.gov', 10)

    # The old slave id should no longer resolve in the registry.
    with self.assertRaises(ItemNotFoundError):
        slave_registry.get_slave(slave_id=old_existing_slave_id)
def test_update_build_with_bad_build_id_fails(self):
    """Updating a build id that does not exist should raise ItemNotFoundError."""
    known_build_id = 1
    unknown_build_id = 2
    update_params = {'key': 'value'}
    cluster_master = ClusterMaster()
    stored_build = Mock()
    cluster_master._all_builds_by_id[known_build_id] = stored_build
    stored_build.validate_update_params = Mock(return_value=(True, update_params))
    stored_build.update_state = Mock()

    with self.assertRaises(ItemNotFoundError):
        cluster_master.handle_request_to_update_build(unknown_build_id, update_params)
def test_get_slave_returns_expected_value_given_valid_arguments(self):
    """get_slave should resolve slaves both by id and by url."""
    cluster_master = ClusterMaster()
    for turtle_url in ('raphael.turtles.gov', 'leonardo.turtles.gov', 'donatello.turtles.gov'):
        cluster_master.connect_slave(turtle_url, 10)

    slave_by_id = cluster_master.get_slave(slave_id=2)
    slave_by_url = cluster_master.get_slave(slave_url='leonardo.turtles.gov')

    self.assertEqual(2, slave_by_id.id,
                     'Retrieved slave should have the same id as requested.')
    self.assertEqual('leonardo.turtles.gov', slave_by_url.url,
                     'Retrieved slave should have the same url as requested.')
def test_updating_slave_to_setup_completed_state_should_tell_build_to_begin_subjob_execution(self):
    """SETUP_COMPLETED should trigger subjob execution on the slave via the scheduler."""
    cluster_master = ClusterMaster()
    fake_build = MagicMock(spec_set=Build)
    cluster_master.get_build = MagicMock(return_value=fake_build)
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, 10)
    connected_slave = cluster_master.get_slave(slave_url=turtle_url)
    fake_scheduler = self.mock_scheduler_pool.get(fake_build)

    cluster_master.handle_slave_state_update(connected_slave, SlaveState.SETUP_COMPLETED)

    fake_scheduler.begin_subjob_executions_on_slave.assert_called_once_with(connected_slave)
def test_exception_raised_during_complete_subjob_does_not_prevent_slave_teardown(self):
    """Even if complete_subjob raises, the scheduler should still free the executor."""
    turtle_url = 'raphael.turtles.gov'
    failing_build = Mock(spec_set=Build, build_id=lambda: 777, is_finished=False)
    failing_build.complete_subjob.side_effect = [RuntimeError('Write failed')]

    cluster_master = ClusterMaster()
    cluster_master._all_builds_by_id[failing_build.build_id()] = failing_build
    cluster_master._all_slaves_by_url[turtle_url] = Mock()
    fake_scheduler = self.mock_scheduler_pool.get(failing_build)

    with self.assertRaisesRegex(RuntimeError, 'Write failed'):
        cluster_master.handle_result_reported_from_slave(
            turtle_url, failing_build.build_id(), subjob_id=888)

    self.assertEqual(fake_scheduler.execute_next_subjob_or_free_executor.call_count, 1)
def test_update_build_with_valid_params_succeeds(self):
    """A valid update request should delegate to the build and report success."""
    build_id = 1
    update_params = {'key': 'value'}
    cluster_master = ClusterMaster()
    stored_build = Mock()
    cluster_master._all_builds_by_id[build_id] = stored_build
    stored_build.validate_update_params = Mock(return_value=(True, update_params))
    stored_build.update_state = Mock()

    success, response = cluster_master.handle_request_to_update_build(build_id, update_params)

    stored_build.update_state.assert_called_once_with(update_params)
    self.assertTrue(success, "Update build should return success")
    self.assertEqual(response, {}, "Response should be empty")
def test_updating_slave_to_idle_state_does_not_mark_build_finished_when_slaves_not_done(self):
    """A build stays unfinished while another slave is still working on it."""
    cluster_master = ClusterMaster()
    idle_slave = Slave('', 1)
    unrelated_slave = Slave('', 1)
    busy_slave = Slave('', 1)
    idle_slave.current_build_id = 1
    unrelated_slave.current_build_id = None
    busy_slave.current_build_id = 1
    build1 = Build(BuildRequest({}))
    cluster_master._all_slaves_by_url = {'1': idle_slave, '2': unrelated_slave, '3': busy_slave}
    cluster_master._all_builds_by_id = {1: build1}
    build1._build_id = 1
    build1.finish = MagicMock()

    cluster_master.handle_slave_state_update(idle_slave, SlaveState.IDLE)

    self.assertFalse(build1.finish.called)
def test_heartbeat_disconnects_unresponsive_slave(self, slave_alive, seconds_since_last_heartbeat):
    """An alive slave with a stale heartbeat should be marked dead; others untouched."""
    stale_heartbeat_time = self._mock_current_datetime - timedelta(seconds=seconds_since_last_heartbeat)
    cluster_master = ClusterMaster()
    fake_slave = Mock()
    # Force ClusterMaster to hand back our mock whenever it constructs a Slave.
    self.patch('app.master.cluster_master.Slave', new=lambda *args: fake_slave)
    cluster_master.connect_slave('slave_url', 1)
    fake_slave.is_alive.return_value = slave_alive
    fake_slave.get_last_heartbeat_time.return_value = stale_heartbeat_time

    cluster_master._disconnect_non_heartbeating_slaves()

    if slave_alive and seconds_since_last_heartbeat == 1000:
        self.assertEqual(fake_slave.mark_dead.call_count, 1,
                         'master disconnects unresponsive slave')
    else:
        self.assertEqual(fake_slave.mark_dead.call_count, 0,
                         'master should not disconnect a dead or responsive slave')
def test_updating_slave_to_setup_completed_state_should_tell_build_to_begin_subjob_execution(self):
    """SETUP_COMPLETED should (asynchronously) start subjob execution on the slave."""
    # NOTE(review): another test with this same name appears in this file; if both
    # end up in one class, Python keeps only the later definition — confirm intent.
    cluster_master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    fake_build = MagicMock(spec_set=Build)
    cluster_master.get_build = MagicMock(return_value=fake_build)
    turtle_url = 'raphael.turtles.gov'
    cluster_master.connect_slave(turtle_url, 10)
    connected_slave = slave_registry.get_slave(slave_url=turtle_url)
    fake_scheduler = self.mock_scheduler_pool.get(fake_build)
    scheduler_begin_event = Event()
    # Signal the event when the scheduler is asked to begin executions.
    fake_scheduler.begin_subjob_executions_on_slave.side_effect = \
        lambda **_: scheduler_begin_event.set()

    cluster_master.handle_slave_state_update(connected_slave, SlaveState.SETUP_COMPLETED)

    was_called = scheduler_begin_event.wait(timeout=5)
    self.assertTrue(was_called,
                    'scheduler.begin_subjob_executions_on_slave should be called in response '
                    'to slave setup completing.')
    _, call_kwargs = fake_scheduler.begin_subjob_executions_on_slave.call_args
    self.assertEqual(call_kwargs.get('slave'), connected_slave)
def test_handle_result_reported_from_slave_does_nothing_when_build_is_canceled(self):
    """Results reported for a canceled build should be ignored entirely."""
    build_id = 1
    slave_url = "url"
    canceled_build = Build(BuildRequest({}))
    canceled_build._is_canceled = True
    self.patch_object(canceled_build, '_handle_subjob_payload')
    self.patch_object(canceled_build, '_mark_subjob_complete')

    cluster_master = ClusterMaster()
    cluster_master._all_builds_by_id[build_id] = canceled_build
    cluster_master._all_slaves_by_url[slave_url] = Mock()
    fake_scheduler = self.mock_scheduler_pool.get(canceled_build)

    cluster_master.handle_result_reported_from_slave(slave_url, build_id, 1)

    self.assertEqual(canceled_build._handle_subjob_payload.call_count, 0,
                     "Build is canceled, should not handle payload")
    self.assertEqual(canceled_build._mark_subjob_complete.call_count, 0,
                     "Build is canceled, should not complete subjobs")
    self.assertEqual(fake_scheduler.execute_next_subjob_or_free_executor.call_count, 0,
                     "Build is canceled, should not do next subjob")
def test_handle_result_reported_from_slave_does_nothing_when_build_is_canceled(self):
    """Results reported for a canceled build should not advance the build."""
    # NOTE(review): a test with this same name appears earlier in this file; if both
    # live in one class, the later definition shadows the earlier one — confirm intent.
    build_id = 1
    slave_url = "url"
    canceled_build = Build(BuildRequest({}))
    canceled_build.handle_subjob_payload = Mock()
    canceled_build.mark_subjob_complete = Mock()
    canceled_build.execute_next_subjob_on_slave = Mock()

    cluster_master = ClusterMaster()
    cluster_master._all_builds_by_id[build_id] = canceled_build
    cluster_master._all_slaves_by_url[slave_url] = Mock()
    canceled_build._is_canceled = True

    cluster_master.handle_result_reported_from_slave(slave_url, build_id, 1)

    self.assertEqual(canceled_build.handle_subjob_payload.call_count, 0,
                     "Build is canceled, should not handle payload")
    self.assertEqual(canceled_build.mark_subjob_complete.call_count, 0,
                     "Build is canceled, should not complete subjobs")
    self.assertEqual(canceled_build.execute_next_subjob_on_slave.call_count, 0,
                     "Build is canceled, should not do next subjob")
def async_run(self, port, log_level, eventlog_file):
    """
    Run a ClusterRunner master service.

    :param port: the port on which to run the slave service
    :type port: int | None
    :param log_level: the log level at which to do application logging (or None for default log level)
    :type log_level: str | None
    :param eventlog_file: an optional alternate file in which to write event logs
    :type eventlog_file: str | None
    """
    # Fall back to configured defaults for any argument the caller left unset.
    port = port or Configuration['port']
    log_level = log_level or Configuration['log_level']
    eventlog_file = eventlog_file or Configuration['eventlog_file']

    # Initialize logging and analytics before constructing any services so that
    # startup events are captured.
    log.configure_logging(log_level=log_level, log_file=Configuration['log_file'])
    analytics.initialize(eventlog_file)
    analytics.record_event(analytics.SERVICE_STARTED, service='master')

    cluster_master = ClusterMaster()
    application = ClusterMasterApplication(cluster_master)

    ioloop = self._start_application(application, port)
    self._write_pid_file(Configuration['master_pid_file'])

    # log startup message once ioloop is running
    hostname = Configuration['hostname']
    log_startup = functools.partial(
        self._logger.info,
        'Master service is running on {}:{}.'.format(hostname, port))
    ioloop.add_callback(log_startup)

    ioloop.start()  # this call blocks until the server is stopped
    ioloop.close(all_fds=True)  # all_fds=True is necessary here to make sure connections don't hang
    self._logger.notice('Master server was stopped.')
def test_handle_request_to_update_build_does_not_raise_exception(self, build_id, update_params):
    """Smoke test: updating an existing build with arbitrary params should not raise."""
    master = ClusterMaster()
    # Fixed for consistency: every other test in this file constructs builds as
    # Build(BuildRequest({})); passing a bare dict bypassed the BuildRequest wrapper.
    master._all_builds_by_id = {build_id: Build(BuildRequest({}))}
    master.handle_request_to_update_build(build_id, update_params)
def test_handle_request_for_new_build_does_not_raise_exception(self, build_params):
    """Smoke test: requesting a new build should not raise for the given params."""
    cluster_master = ClusterMaster()
    cluster_master.handle_request_for_new_build(build_params)