def setUp(self):
    super().setUp()
    self.patch('app.util.fs.create_dir')
    self.patch('app.util.fs.async_delete')
    self.patch('os.makedirs')
    self.mock_slave_allocator = self.patch('app.master.cluster_master.SlaveAllocator').return_value
    self.mock_scheduler_pool = self.patch('app.master.cluster_master.BuildSchedulerPool').return_value

    # Mock the datetime class inside cluster_master.
    self._mock_current_datetime = datetime(2018, 4, 1)
    self._mock_datetime = self.patch('app.master.cluster_master.datetime')
    self._mock_datetime.now.return_value = self._mock_current_datetime

    # Two threads are run every time we start up the ClusterMaster. We redirect the calls to
    # `ThreadPoolExecutor.submit` through a mock proxy so we can capture events.
    self.thread_pool_executor = ThreadPoolExecutor(max_workers=2)
    self._thread_pool_executor_cls = self.patch('app.master.cluster_master.ThreadPoolExecutor')
    self._thread_pool_executor_cls.return_value.submit.side_effect = self.thread_pool_executor.submit

    SlaveRegistry.reset_singleton()

    Configuration['pagination_offset'] = self._PAGINATION_OFFSET
    Configuration['pagination_limit'] = self._PAGINATION_LIMIT
    Configuration['pagination_max_limit'] = self._PAGINATION_MAX_LIMIT
def setUp(self):
    # Configure logging to go to stdout. This makes debugging easier by allowing us to see logs for failed tests.
    log.configure_logging('DEBUG')

    self._reset_config()
    Secret.set('testsecret')
    SlaveRegistry.reset_singleton()

    self.cluster = FunctionalTestCluster(verbose=self._get_test_verbosity())
    self._network = Network()
def setUp(self):
    super().setUp()
    self.addCleanup(patch.stopall)

    self._patched_items = {}
    self._blacklist_methods_not_allowed_in_unit_tests()

    # Stub out a few library dependencies that launch subprocesses.
    self.patch('app.util.autoversioning.get_version').return_value = '0.0.0'
    self.patch('app.util.conf.base_config_loader.platform.node').return_value = self._fake_hostname

    if self._do_network_mocks:
        # requests.Session() also makes some subprocess calls on instantiation.
        self.patch('app.util.network.requests.Session')
        # Stub out Network.are_hosts_same() call with a simple string comparison.
        self.patch('app.util.network.Network.are_hosts_same', new=lambda host_a, host_b: host_a == host_b)

    # Reset singletons so that they get recreated for every test that uses them.
    Configuration.reset_singleton()
    UnhandledExceptionHandler.reset_singleton()
    SlaveRegistry.reset_singleton()

    # Explicitly initialize UnhandledExceptionHandler singleton here (on the main thread) since it sets up signal
    # handlers that must execute on the main thread.
    UnhandledExceptionHandler.singleton()

    MasterConfigLoader().configure_defaults(Configuration.singleton())
    MasterConfigLoader().configure_postload(Configuration.singleton())
    self.patch('app.util.conf.master_config_loader.MasterConfigLoader.load_from_config_file')

    # Reset counters
    Slave._slave_id_counter = Counter()
    Build._build_id_counter = Counter()
    analytics._event_id_generator = Counter()

    # Configure logging to go to stdout. This makes debugging easier by allowing us to see logs for failed tests.
    log.configure_logging('DEBUG')
    # Then stub out configure_logging so we don't end up logging to real files during testing.
    self.patch('app.util.log.configure_logging')

    # Set up TestHandler. This allows asserting on log messages in tests.
    self.log_handler = logbook.TestHandler(bubble=True)
    self.log_handler.push_application()

    self._base_setup_called = True
def test_slave_reconnection_does_not_take_down_master(self):
    SlaveRegistry.reset_singleton()
    test_config = JOB_WITH_SETUP_AND_TEARDOWN
    job_config = yaml.safe_load(test_config.config[os.name])['JobWithSetupAndTeardown']
    master = self.cluster.start_master()

    # Start a slave, hard kill it, then reconnect it.
    self.cluster.start_slave(num_executors_per_slave=5, start_port=43001)
    self.cluster.kill_slaves(kill_gracefully=False)
    # Make sure the slave restarts with the same port.
    slave = self.cluster.start_slave(num_executors_per_slave=5, start_port=43001)

    # Start two builds.
    project_dir_1 = tempfile.TemporaryDirectory()
    build_1 = master.post_new_build({
        'type': 'directory',
        'config': job_config,
        'project_directory': project_dir_1.name,
    })
    project_dir_2 = tempfile.TemporaryDirectory()
    build_2 = master.post_new_build({
        'type': 'directory',
        'config': job_config,
        'project_directory': project_dir_2.name,
    })

    self.assertTrue(master.block_until_build_finished(build_1['build_id'], timeout=45),
                    'Build 1 should finish building within the timeout.')
    self.assertTrue(master.block_until_build_finished(build_2['build_id'], timeout=45),
                    'Build 2 should finish building within the timeout.')

    slave.block_until_idle(timeout=20)  # Ensure slave teardown has finished before making assertions.

    self._assert_build_completed_as_expected(build_1['build_id'], test_config, project_dir_1)
    self._assert_build_completed_as_expected(build_2['build_id'], test_config, project_dir_2)
def test_connect_slave_with_existing_dead_slave_creates_new_alive_instance(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()

    master.connect_slave('existing-slave.turtles.gov', 10)
    existing_slave = slave_registry.get_slave(slave_id=None, slave_url='existing-slave.turtles.gov')
    existing_slave.set_is_alive(False)
    existing_slave_id = existing_slave.id

    connect_response = master.connect_slave('existing-slave.turtles.gov', 10)
    new_slave = slave_registry.get_slave(slave_url='existing-slave.turtles.gov')

    self.assertNotEqual(str(existing_slave_id), connect_response['slave_id'],
                        'The re-connected slave should have generated a new slave id.')
    self.assertTrue(new_slave.is_alive(use_cached=True),
                    'The new slave should have been marked as alive once instantiated.')
    self.assertEqual(2, self.mock_slave_allocator.add_idle_slave.call_count,
                     'Expected slave to be added to the idle slaves list.')
def test_remove_slave_by_slave_instance_removes_slave_from_both_dicts(self):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave2 = Slave('leonardo.turtles.gov', 1)

    slave_registry.add_slave(slave1)
    slave_registry.add_slave(slave2)

    self.assertEqual(2, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly two slaves should be in the all_slaves_by_id dict.')
    self.assertEqual(2, len(slave_registry.get_all_slaves_by_url()),
                     'Exactly two slaves should be in the all_slaves_by_url dict.')

    slave_registry.remove_slave(slave=slave1)

    self.assertEqual(1, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly one slave should be in the all_slaves_by_id dict after removing one slave.')
    self.assertEqual(1, len(slave_registry.get_all_slaves_by_url()),
                     'Exactly one slave should be in the all_slaves_by_url dict after removing one slave.')
def __init__(self):
    self._logger = get_logger(__name__)
    self._master_results_path = Configuration['results_directory']
    self._slave_registry = SlaveRegistry.singleton()
    self._scheduler_pool = BuildSchedulerPool()
    self._build_request_handler = BuildRequestHandler(self._scheduler_pool)
    self._build_request_handler.start()
    self._slave_allocator = SlaveAllocator(self._scheduler_pool)
    self._slave_allocator.start()

    # The best practice for determining the number of threads to use is the number of threads per core
    # multiplied by the number of physical cores across all sockets. For example, with 2 sockets of 10 cores
    # each and 2 threads per core, the max would be 40.
    #
    # Currently we use threads for incrementing/decrementing slave executor counts (lock acquisition) and
    # tearing down the slave (network IO). 32 threads should be plenty for these tasks. Under heavy load, the
    # bottleneck will be the number of executors, not the time it takes to lock/unlock the executor counts or
    # the number of teardown requests. Tweak the number to find the sweet spot if you feel this is the case.
    self._thread_pool_executor = ThreadPoolExecutor(max_workers=32)

    # Asynchronously delete (but immediately rename) all old builds when master starts.
    # Remove this if/when build numbers are unique across master starts/stops.
    if os.path.exists(self._master_results_path):
        fs.async_delete(self._master_results_path)
    fs.create_dir(self._master_results_path)

    # Configure heartbeat tracking.
    self._unresponsive_slaves_cleanup_interval = Configuration['unresponsive_slaves_cleanup_interval']
    self._hb_scheduler = sched.scheduler()

    SlavesCollector.register_slaves_metrics_collector(
        lambda: self._slave_registry.get_all_slaves_by_id().values())
def put(self, slave_id):
    new_slave_state = self.decoded_body.get('slave', {}).get('state')
    slave = SlaveRegistry.singleton().get_slave(slave_id=int(slave_id))
    self._cluster_master.handle_slave_state_update(slave, new_slave_state)
    self._cluster_master.update_slave_last_heartbeat_time(slave)
    self._write_status({'slave': slave.api_representation()})
def test_get_slave_raises_exception_on_invalid_arguments(self, get_slave_kwargs):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave_registry.add_slave(slave1)

    with self.assertRaises(ValueError):
        slave_registry.get_slave(**get_slave_kwargs)
def test_handle_result_reported_from_slave_when_build_is_canceled(self):
    build_id = 1
    slave_url = 'url'
    build = Build(BuildRequest({}))
    self.patch('app.master.build.util')
    build.generate_project_type()
    build.cancel()
    self.patch_object(build, '_handle_subjob_payload')
    self.patch_object(build, '_mark_subjob_complete')

    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    BuildStore._all_builds_by_id[build_id] = build
    slave_registry._all_slaves_by_url[slave_url] = Mock()
    mock_scheduler = self.mock_scheduler_pool.get(build)

    master.handle_result_reported_from_slave(slave_url, build_id, 1)

    self.assertEqual(build._handle_subjob_payload.call_count, 1,
                     'Canceled builds should handle payload.')
    self.assertEqual(build._mark_subjob_complete.call_count, 1,
                     'Canceled builds should mark their subjobs complete.')
    self.assertTrue(mock_scheduler.execute_next_subjob_or_free_executor.called)
def post(self):
    shutdown_all = self.decoded_body.get('shutdown_all')
    if shutdown_all:
        slaves_to_shutdown = SlaveRegistry.singleton().get_all_slaves_by_id().keys()
    else:
        slaves_to_shutdown = [int(slave_id) for slave_id in self.decoded_body.get('slaves')]

    self._cluster_master.set_shutdown_mode_on_slaves(slaves_to_shutdown)
def test_updating_slave_to_nonexistent_state_should_raise_bad_request_error(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    slave_url = 'raphael.turtles.gov'
    master.connect_slave(slave_url, 10)
    slave = slave_registry.get_slave(slave_url=slave_url)

    with self.assertRaises(BadRequestError):
        master.handle_slave_state_update(slave, 'NONEXISTENT_STATE')
def test_get_slave_raises_exception_on_slave_not_found(self, get_slave_kwargs):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave2 = Slave('leonardo.turtles.gov', 1)
    slave_registry.add_slave(slave1)
    slave_registry.add_slave(slave2)

    with self.assertRaises(ItemNotFoundError):
        slave_registry.get_slave(**get_slave_kwargs)
def get(self):
    response = {
        'slaves': [slave.api_representation()
                   for slave in SlaveRegistry.singleton().get_all_slaves_by_id().values()]
    }
    self.write(response)
def test_remove_slave_raises_exception_on_invalid_arguments(self):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave_registry.add_slave(slave1)

    # Both arguments specified
    with self.assertRaises(ValueError):
        slave_registry.remove_slave(slave=slave1, slave_url=slave1.url)
    # No arguments specified
    with self.assertRaises(ValueError):
        slave_registry.remove_slave(slave=None, slave_url=None)
def test_connect_slave_adds_new_slave_if_slave_never_connected_before(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()

    master.connect_slave('never-before-seen.turtles.gov', 10)

    self.assertEqual(1, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly one slave should be registered with the master.')
    self.assertIsNotNone(slave_registry.get_slave(slave_id=None, slave_url='never-before-seen.turtles.gov'),
                         'Registered slave does not have the expected url.')
def test_updating_slave_to_shutdown_should_call_slave_set_shutdown_mode(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    slave_url = 'raphael.turtles.gov'
    master.connect_slave(slave_url, 10)
    slave = slave_registry.get_slave(slave_url=slave_url)
    slave.set_shutdown_mode = Mock()

    master.handle_slave_state_update(slave, SlaveState.SHUTDOWN)

    slave.set_shutdown_mode.assert_called_once_with()
def test_add_slave_adds_slave_in_both_dicts(self):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave2 = Slave('leonardo.turtles.gov', 1)
    slave_registry.add_slave(slave1)
    slave_registry.add_slave(slave2)

    self.assertEqual(2, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly two slaves should be in the all_slaves_by_id dict.')
    self.assertEqual(2, len(slave_registry.get_all_slaves_by_url()),
                     'Exactly two slaves should be in the all_slaves_by_url dict.')
def test_updating_slave_to_disconnected_state_should_reset_slave_current_build_id(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    slave_url = 'raphael.turtles.gov'
    master.connect_slave(slave_url, num_executors=10)
    slave = slave_registry.get_slave(slave_url=slave_url)
    slave.current_build_id = 4

    master.handle_slave_state_update(slave, SlaveState.DISCONNECTED)

    self.assertIsNone(slave.current_build_id)
def test_updating_slave_to_disconnected_state_should_mark_slave_as_dead(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    slave_url = 'raphael.turtles.gov'
    master.connect_slave(slave_url, num_executors=10)
    slave = slave_registry.get_slave(slave_url=slave_url)
    self.assertTrue(slave.is_alive())

    master.handle_slave_state_update(slave, SlaveState.DISCONNECTED)

    self.assertFalse(slave.is_alive())
def test_get_slave_returns_valid_slave(self):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave2 = Slave('leonardo.turtles.gov', 1)
    slave_registry.add_slave(slave1)
    slave_registry.add_slave(slave2)

    self.assertEqual(slave_registry.get_slave(slave_url=slave1.url), slave1,
                     'Get slave with url should return valid slave.')
    self.assertEqual(slave_registry.get_slave(slave_id=slave2.id), slave2,
                     'Get slave with id should return valid slave.')
def test_connect_slave_with_existing_dead_slave_removes_old_slave_entry_from_registry(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()

    master.connect_slave('existing-slave.turtles.gov', 10)
    old_existing_slave = slave_registry.get_slave(slave_id=None, slave_url='existing-slave.turtles.gov')
    old_existing_slave_id = old_existing_slave.id

    master.connect_slave('existing-slave.turtles.gov', 10)

    with self.assertRaises(ItemNotFoundError):
        slave_registry.get_slave(slave_id=old_existing_slave_id)
def test_connect_slave_with_existing_slave_running_build_cancels_build(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()

    master.connect_slave('running-slave.turtles.gov', 10)
    build_mock = MagicMock(spec_set=Build)
    BuildStore._all_builds_by_id[1] = build_mock
    existing_slave = slave_registry.get_slave(slave_id=None, slave_url='running-slave.turtles.gov')
    existing_slave.current_build_id = 1

    master.connect_slave('running-slave.turtles.gov', 10)

    self.assertTrue(build_mock.cancel.called, 'The build was not cancelled.')
def post(self, build_id, subjob_id):
    slave_url = self.decoded_body.get('slave')
    slave = SlaveRegistry.singleton().get_slave(slave_url=slave_url)
    file_payload = self.request.files.get('file')
    if not file_payload:
        raise RuntimeError('Result file not provided')

    slave_executor_id = self.decoded_body.get('metric_data', {}).get('executor_id')
    analytics.record_event(analytics.MASTER_RECEIVED_RESULT, executor_id=slave_executor_id,
                           build_id=int(build_id), subjob_id=int(subjob_id), slave_id=slave.id)

    self._cluster_master.handle_result_reported_from_slave(
        slave_url, int(build_id), int(subjob_id), file_payload[0])
    self._write_status()
def post(self, slave_id):
    slave = SlaveRegistry.singleton().get_slave(slave_id=int(slave_id))

    # If the slave has been marked dead but still sends a heartbeat, the master does not update the last
    # heartbeat time and the method returns false. Additionally, the master responds to the slave with the
    # slave status. The slave will treat an is_alive=false response as a heartbeat failure, and die.
    #
    # The reason the master returns the status to the slave instead of simply marking the slave as alive is
    # that neither the master nor the slave maintains an explicit state about when and why the slave was
    # marked dead. It is a lot cleaner for the heartbeat functionality to indicate a heartbeat failure and
    # let the slave make a decision based on that.
    is_alive = self._cluster_master.update_slave_last_heartbeat_time(slave)
    self.write({'is_alive': is_alive})
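# For context, below is a minimal sketch of how a slave-side heartbeat loop could react to the
# `is_alive` flag returned by the handler above. It is illustrative only: the endpoint path, the
# 5-second interval, and the `shutdown` callback are assumptions, not the actual ClusterRunner slave API.
import logging
import time

import requests


def run_heartbeat_loop(master_url, slave_id, shutdown, interval_seconds=5):
    """Hypothetical slave-side heartbeat loop; names and the URL layout are assumptions."""
    logger = logging.getLogger(__name__)
    while True:
        # Send a heartbeat; the master replies with whether it still considers this slave alive.
        response = requests.post('{}/v1/slave/{}/heartbeat'.format(master_url, slave_id))
        if not response.json().get('is_alive', False):
            # The master has already marked this slave dead, so treat the response as a heartbeat
            # failure and shut down instead of continuing to send heartbeats.
            logger.error('Master reported this slave as dead; shutting down.')
            shutdown()
            return
        time.sleep(interval_seconds)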
def test_exception_raised_during_complete_subjob_does_not_prevent_slave_teardown(self):
    slave_url = 'raphael.turtles.gov'
    mock_build = Mock(spec_set=Build, build_id=lambda: 777, is_finished=False)
    mock_build.complete_subjob.side_effect = [RuntimeError('Write failed')]

    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    BuildStore._all_builds_by_id[mock_build.build_id()] = mock_build
    slave_registry._all_slaves_by_url[slave_url] = Mock()
    mock_scheduler = self.mock_scheduler_pool.get(mock_build)

    with self.assertRaisesRegex(RuntimeError, 'Write failed'):
        master.handle_result_reported_from_slave(slave_url, mock_build.build_id(), subjob_id=888)

    self.assertEqual(mock_scheduler.execute_next_subjob_or_free_executor.call_count, 1)
def test_remove_slave_by_slave_url_removes_slave_from_both_dicts(self):
    slave_registry = SlaveRegistry.singleton()
    slave1 = Slave('raphael.turtles.gov', 1)
    slave2 = Slave('leonardo.turtles.gov', 1)

    slave_registry.add_slave(slave1)
    slave_registry.add_slave(slave2)

    self.assertEqual(2, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly two slaves should be in the all_slaves_by_id dict.')
    self.assertEqual(2, len(slave_registry.get_all_slaves_by_url()),
                     'Exactly two slaves should be in the all_slaves_by_url dict.')

    slave_registry.remove_slave(slave_url=slave1.url)

    self.assertEqual(1, len(slave_registry.get_all_slaves_by_id()),
                     'Exactly one slave should be in the all_slaves_by_id dict after removing one slave.')
    self.assertEqual(1, len(slave_registry.get_all_slaves_by_url()),
                     'Exactly one slave should be in the all_slaves_by_url dict after removing one slave.')
def test_updating_slave_to_setup_completed_state_should_tell_build_to_begin_subjob_execution(self):
    master = ClusterMaster()
    slave_registry = SlaveRegistry.singleton()
    fake_build = MagicMock(spec_set=Build)
    master.get_build = MagicMock(return_value=fake_build)
    slave_url = 'raphael.turtles.gov'
    master.connect_slave(slave_url, 10)
    slave = slave_registry.get_slave(slave_url=slave_url)
    mock_scheduler = self.mock_scheduler_pool.get(fake_build)
    scheduler_begin_event = Event()
    mock_scheduler.begin_subjob_executions_on_slave.side_effect = lambda **_: scheduler_begin_event.set()

    master.handle_slave_state_update(slave, SlaveState.SETUP_COMPLETED)

    was_called = scheduler_begin_event.wait(timeout=5)
    self.assertTrue(was_called, 'scheduler.begin_subjob_executions_on_slave should be called in response '
                                'to slave setup completing.')
    _, call_kwargs = mock_scheduler.begin_subjob_executions_on_slave.call_args
    self.assertEqual(call_kwargs.get('slave'), slave)
def setUp(self):
    super().setUp()
    SlaveRegistry.reset_singleton()
def get(self, slave_id):
    slave = SlaveRegistry.singleton().get_slave(slave_id=int(slave_id))
    response = {'slave': slave.api_representation()}
    self.write(response)
def post(self, slave_id):
    slave = SlaveRegistry.singleton().get_slave(slave_id=int(slave_id))
    self._cluster_master.update_slave_last_heartbeat_time(slave)