def setUp(self):
    """Create an EphemeralBuilderManager backed by mock callbacks and the mock executor.

    After initialization the manager's orchestrator must already have the
    realm and job prefixes registered as callbacks; the fixture asserts this
    before creating the build job used by the individual tests.
    """
    super(TestEphemeralLifecycle, self).setUp()

    # Make the "test" executor name resolve to this test case's executor factory.
    EphemeralBuilderManager.EXECUTORS["test"] = self._create_mock_executor

    self.register_component_callback = Mock()
    self.unregister_component_callback = Mock()
    self.job_heartbeat_callback = Mock()
    self.job_complete_callback = AsyncWrapper(Mock())

    manager_config = {
        "EXECUTOR": "test",
        "ORCHESTRATOR": {"MEM_CONFIG": None},
    }
    self.manager = EphemeralBuilderManager(
        self.register_component_callback,
        self.unregister_component_callback,
        self.job_heartbeat_callback,
        self.job_complete_callback,
        "127.0.0.1",
        30,
    )
    self.manager.initialize(manager_config)

    # Ensure that the realm and building callbacks have been registered.
    registered_prefixes = list(self.manager._orchestrator.callbacks)
    for expected_prefix in (REALM_PREFIX, JOB_PREFIX):
        self.assertIn(expected_prefix, registered_prefixes)

    self.mock_job = self._create_build_job()
    self.mock_job_key = slash_join("building", BUILD_UUID)
def __init__(self, build_logs, repository_build_uuid):
    """Store the build identifiers and wrappers, then write the initial status.

    `build_logs` is kept both wrapped in an AsyncWrapper (`_build_logs`) and
    unwrapped (`_sync_build_logs`).
    """
    self._uuid = repository_build_uuid

    # Nothing has been reported yet for this build.
    self._current_phase = None
    self._current_command = None

    self._build_logs = AsyncWrapper(build_logs)
    self._sync_build_logs = build_logs
    self._build_model = AsyncWrapper(model.build)

    initial_status = {
        'total_commands': 0,
        'current_command': None,
        'push_completion': 0.0,
        'pull_completion': 0.0,
    }
    self._status = initial_status

    # Write the initial status.
    self.__exit__(None, None, None)
def _get_conn(self):
    """ Opens an ec2 connection for the configured region, wrapped in an AsyncWrapper. """
    config = self.executor_config
    connection = boto.ec2.connect_to_region(
        config["EC2_REGION"],
        aws_access_key_id=config["AWS_ACCESS_KEY"],
        aws_secret_access_key=config["AWS_SECRET_KEY"],
    )
    return AsyncWrapper(connection)
class TestEphemeralLifecycle(EphemeralBuilderTestCase):
    """
    Tests the various lifecycles of the ephemeral builder and its interaction with etcd.
    """

    def __init__(self, *args, **kwargs):
        super(TestEphemeralLifecycle, self).__init__(*args, **kwargs)
        self.etcd_client_mock = None
        self.test_executor = None

    def _create_completed_future(self, result=None):
        """Return a callable that yields a fresh, already-resolved Future on every call."""

        def inner(*args, **kwargs):
            new_future = Future()
            new_future.set_result(result)
            return new_future

        return inner

    def _create_mock_executor(self, *args, **kwargs):
        """Build the mock BuilderExecutor; start_builder resolves to the execution id "123"."""
        self.test_executor = Mock(spec=BuilderExecutor)
        self.test_executor.start_builder = Mock(
            side_effect=self._create_completed_future("123"))
        self.test_executor.stop_builder = Mock(
            side_effect=self._create_completed_future())
        self.test_executor.setup_time = 60
        self.test_executor.name = "MockExecutor"
        self.test_executor.minimum_retry_threshold = 0
        return self.test_executor

    def setUp(self):
        super(TestEphemeralLifecycle, self).setUp()

        EphemeralBuilderManager.EXECUTORS["test"] = self._create_mock_executor

        self.register_component_callback = Mock()
        self.unregister_component_callback = Mock()
        self.job_heartbeat_callback = Mock()
        self.job_complete_callback = AsyncWrapper(Mock())

        self.manager = EphemeralBuilderManager(
            self.register_component_callback,
            self.unregister_component_callback,
            self.job_heartbeat_callback,
            self.job_complete_callback,
            "127.0.0.1",
            30,
        )

        self.manager.initialize({
            "EXECUTOR": "test",
            "ORCHESTRATOR": {
                "MEM_CONFIG": None
            },
        })

        # Ensure that the realm and building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(REALM_PREFIX, callback_keys)
        self.assertIn(JOB_PREFIX, callback_keys)

        self.mock_job = self._create_build_job()
        self.mock_job_key = slash_join("building", BUILD_UUID)

    def tearDown(self):
        super(TestEphemeralLifecycle, self).tearDown()
        self.manager.shutdown()

    @coroutine
    def _setup_job_for_managers(self):
        """Schedule the mock job and fire the realm CREATE event for it.

        Returns (via Return) the mock BuildComponent that the register
        callback hands back to the manager.
        """
        test_component = Mock(spec=BuildComponent)
        test_component.builder_realm = REALM_ID
        test_component.start_build = Mock(
            side_effect=self._create_completed_future())
        self.register_component_callback.return_value = test_component

        is_scheduled = yield From(self.manager.schedule(self.mock_job))
        self.assertTrue(is_scheduled)
        self.assertEqual(self.test_executor.start_builder.call_count, 1)

        # Ensure that the job, realm, and metric callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(self.mock_job_key, self.manager._orchestrator.state)
        self.assertIn(REALM_PREFIX, callback_keys)
        # TODO: assert metric key has been set

        realm_for_build = self._find_realm_key(self.manager._orchestrator,
                                               BUILD_UUID)

        raw_realm_data = yield From(
            self.manager._orchestrator.get_key(
                slash_join("realm", realm_for_build)))
        realm_data = json.loads(raw_realm_data)
        realm_data["realm"] = REALM_ID

        # Right now the job is not registered with any managers because etcd has not accepted the job
        self.assertEqual(self.register_component_callback.call_count, 0)

        # Fire off a realm changed with the same data.
        yield From(
            self.manager._realm_callback(
                KeyChange(KeyEvent.CREATE, slash_join(REALM_PREFIX, REALM_ID),
                          json.dumps(realm_data))))

        # Ensure that we have at least one component node.
        self.assertEqual(self.register_component_callback.call_count, 1)
        self.assertEqual(1, self.manager.num_workers())

        # Ensure that the build info exists.
        self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

        raise Return(test_component)

    @staticmethod
    def _find_realm_key(orchestrator, build_uuid):
        """Return the realm stored in orchestrator state for `build_uuid`.

        Raises KeyError when no realm entry references that build.
        """
        for key, value in iteritems(orchestrator.state):
            if key.startswith(REALM_PREFIX):
                parsed_value = json.loads(value)
                # The queue item body is itself a JSON-encoded string.
                body = json.loads(parsed_value["job_queue_item"]["body"])
                if body["build_uuid"] == build_uuid:
                    return parsed_value["realm"]
        raise KeyError

    @async_test
    def test_schedule_and_complete(self):
        # Test that a job is properly registered with all of the managers
        test_component = yield From(self._setup_job_for_managers())

        # Take the job ourselves
        yield From(self.manager.build_component_ready(test_component))

        self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

        # Finish the job
        yield From(
            self.manager.job_completed(self.mock_job, BuildJobResult.COMPLETE,
                                       test_component))

        # Ensure that the executor kills the job.
        self.assertEqual(self.test_executor.stop_builder.call_count, 1)

        # Ensure the build information is cleaned up.
        self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
        self.assertEqual(0, self.manager.num_workers())

    @async_test
    def test_another_manager_takes_job(self):
        # Prepare a job to be taken by another manager
        test_component = yield From(self._setup_job_for_managers())

        yield From(
            self.manager._realm_callback(
                KeyChange(
                    KeyEvent.DELETE,
                    slash_join(REALM_PREFIX, REALM_ID),
                    json.dumps({
                        "realm": REALM_ID,
                        "token": "beef",
                        "execution_id": "123",
                        "job_queue_item": self.mock_job.job_item,
                    }),
                )))

        self.unregister_component_callback.assert_called_once_with(
            test_component)

        # Ensure that the executor does not kill the job.
        self.assertEqual(self.test_executor.stop_builder.call_count, 0)

        # Ensure that we still have the build info, but not the component.
        self.assertEqual(0, self.manager.num_workers())
        self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

        # Delete the job once it has "completed".
        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.DELETE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": False,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        # Ensure the job was removed from the info, but stop was not called.
        self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
        self.assertEqual(self.test_executor.stop_builder.call_count, 0)

    @async_test
    def test_job_started_by_other_manager(self):
        # Ensure that the building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(JOB_PREFIX, callback_keys)

        # Send a signal to the callback that the job has been created.
        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.CREATE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": False,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        # Ensure the create does nothing.
        self.assertEqual(self.test_executor.stop_builder.call_count, 0)

    @async_test
    def test_expiring_worker_not_started(self):
        # Ensure that the building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(JOB_PREFIX, callback_keys)

        # Send a signal to the callback that a worker has expired
        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.EXPIRE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": True,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        # Since the realm was never registered, expiration should do nothing.
        self.assertEqual(self.test_executor.stop_builder.call_count, 0)

    @async_test
    def test_expiring_worker_started(self):
        yield From(self._setup_job_for_managers())

        # Ensure that the building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(JOB_PREFIX, callback_keys)

        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.EXPIRE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": True,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        self.test_executor.stop_builder.assert_called_once_with("123")
        self.assertEqual(self.test_executor.stop_builder.call_count, 1)

    @async_test
    def test_buildjob_deleted(self):
        yield From(self._setup_job_for_managers())

        # Ensure that the building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(JOB_PREFIX, callback_keys)

        # Send a signal to the callback that the job key has been deleted
        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.DELETE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": False,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        self.assertEqual(self.test_executor.stop_builder.call_count, 0)
        self.assertEqual(self.job_complete_callback.call_count, 0)
        self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

    @async_test
    def test_builder_never_starts(self):
        yield From(self._setup_job_for_managers())

        # Ensure that the building callbacks have been registered
        callback_keys = list(self.manager._orchestrator.callbacks)
        self.assertIn(JOB_PREFIX, callback_keys)

        # Send a signal to the callback that a worker has expired
        yield From(
            self.manager._job_callback(
                KeyChange(
                    KeyEvent.EXPIRE,
                    self.mock_job_key,
                    json.dumps({
                        "had_heartbeat": False,
                        "job_queue_item": self.mock_job.job_item
                    }),
                )))

        self.test_executor.stop_builder.assert_called_once_with("123")
        self.assertEqual(self.test_executor.stop_builder.call_count, 1)

        # Ensure the job was marked as incomplete, with an update_phase to True (so the DB record and
        # logs are updated as well)
        yield From(
            self.job_complete_callback.assert_called_once_with(
                ANY,
                BuildJobResult.INCOMPLETE,
                "MockExecutor",
                update_phase=True))

    @async_test
    def test_change_worker(self):
        # Send a signal to the callback that a worker key has been changed.
        # The callback is awaited like in every other test here; without the
        # `yield From` the assertion below would not observe its effects.
        yield From(
            self.manager._job_callback(
                KeyChange(KeyEvent.SET, self.mock_job_key, "value")))
        self.assertEqual(self.test_executor.stop_builder.call_count, 0)

    @async_test
    def test_realm_expired(self):
        yield From(self._setup_job_for_managers())

        # Send a signal to the callback that a realm has expired
        yield From(
            self.manager._realm_callback(
                KeyChange(
                    KeyEvent.EXPIRE,
                    self.mock_job_key,
                    json.dumps({
                        "realm": REALM_ID,
                        "execution_id": "foobar",
                        "executor_name": "MockExecutor",
                        "job_queue_item": {
                            "body": '{"build_uuid": "fakeid"}'
                        },
                    }),
                )))

        # Ensure that the cleanup code for the executor was called.
        self.test_executor.stop_builder.assert_called_once_with("foobar")
        self.assertEqual(self.test_executor.stop_builder.call_count, 1)
def start_builder(self, realm, token, build_uuid):
    """Launch one EC2 builder instance for a build and tag it with its metadata.

    This is a generator-based coroutine body (it uses `yield From` and
    `raise Return`).

    :param realm: realm value passed to the cloud config and written to the "Realm" tag
    :param token: token value passed to the cloud config and written to the "Token" tag
    :param build_uuid: build identifier used for the cloud config, tags, and log messages
    :returns: (via Return) the id of the launched instance
    :raises boto.exception.EC2ResponseError: when run_instances fails
    :raises ExecutorException: when the reservation does not hold exactly one
        instance, or the instance still cannot be found on the last tag attempt
    """
    region = self.executor_config["EC2_REGION"]
    channel = self.executor_config.get("COREOS_CHANNEL", "stable")

    coreos_ami = self.executor_config.get("COREOS_AMI", None)
    if coreos_ami is None:
        # No AMI configured: look one up off the event loop thread.
        get_ami_callable = partial(self._get_coreos_ami, region, channel)
        coreos_ami = yield From(
            self._loop.run_in_executor(None, get_ami_callable))

    user_data = self.generate_cloud_config(realm, token, build_uuid, channel,
                                           self.manager_hostname)
    logger.debug("Generated cloud config for build %s: %s", build_uuid,
                 user_data)

    ec2_conn = self._get_conn()

    ssd_root_ebs = boto.ec2.blockdevicemapping.BlockDeviceType(
        size=int(self.executor_config.get("BLOCK_DEVICE_SIZE", 48)),
        volume_type="gp2",
        delete_on_termination=True,
    )
    block_devices = boto.ec2.blockdevicemapping.BlockDeviceMapping()
    block_devices["/dev/xvda"] = ssd_root_ebs

    # A network interface is only specified when a VPC subnet is configured.
    interfaces = None
    if self.executor_config.get("EC2_VPC_SUBNET_ID", None) is not None:
        interface = boto.ec2.networkinterface.NetworkInterfaceSpecification(
            subnet_id=self.executor_config["EC2_VPC_SUBNET_ID"],
            groups=self.executor_config["EC2_SECURITY_GROUP_IDS"],
            associate_public_ip_address=True,
        )
        interfaces = boto.ec2.networkinterface.NetworkInterfaceCollection(
            interface)

    try:
        reservation = yield From(
            ec2_conn.run_instances(
                coreos_ami,
                instance_type=self.executor_config["EC2_INSTANCE_TYPE"],
                key_name=self.executor_config.get("EC2_KEY_NAME", None),
                user_data=user_data,
                instance_initiated_shutdown_behavior="terminate",
                block_device_map=block_devices,
                network_interfaces=interfaces,
            ))
    except boto.exception.EC2ResponseError:
        logger.exception("Unable to spawn builder instance")
        metric_queue.ephemeral_build_worker_failure.Inc()
        # Bare raise re-raises the same exception with its original traceback.
        raise

    if not reservation.instances:
        raise ExecutorException("Unable to spawn builder instance.")
    elif len(reservation.instances) != 1:
        raise ExecutorException("EC2 started wrong number of instances!")

    launched = AsyncWrapper(reservation.instances[0])

    # Sleep a few seconds to wait for AWS to spawn the instance.
    yield From(trollius.sleep(_TAG_RETRY_SLEEP))

    # Tag the instance with its metadata.
    for i in range(0, _TAG_RETRY_COUNT):
        try:
            yield From(
                launched.add_tags({
                    "Name": "Quay Ephemeral Builder",
                    "Realm": realm,
                    "Token": token,
                    "BuildUUID": build_uuid,
                }))
        except boto.exception.EC2ResponseError as ec2e:
            if ec2e.error_code == "InvalidInstanceID.NotFound":
                if i < _TAG_RETRY_COUNT - 1:
                    logger.warning(
                        "Failed to write EC2 tags for instance %s for build %s (attempt #%s)",
                        launched.id,
                        build_uuid,
                        i,
                    )
                    yield From(trollius.sleep(_TAG_RETRY_SLEEP))
                    continue

                raise ExecutorException("Unable to find builder instance.")

            # Any other EC2 error is logged and the next attempt runs immediately.
            logger.exception("Failed to write EC2 tags (attempt #%s)", i)
        else:
            # Tags were written; stop instead of repeating the same call on
            # every remaining attempt.
            break

    logger.debug("Machine with ID %s started for build %s", launched.id,
                 build_uuid)
    raise Return(launched.id)
class StatusHandler(object):
    """ Context wrapper for writing status to build logs.

    Entering the context yields the mutable status dict; leaving it writes
    that dict to the build logs via `set_status`.
    """

    def __init__(self, build_logs, repository_build_uuid):
        self._current_phase = None
        self._current_command = None
        self._uuid = repository_build_uuid
        # The same build_logs object is kept wrapped (for the coroutines
        # below) and unwrapped (for the synchronous write in __exit__).
        self._build_logs = AsyncWrapper(build_logs)
        self._sync_build_logs = build_logs
        self._build_model = AsyncWrapper(model.build)

        self._status = {
            'total_commands': 0,
            'current_command': None,
            'push_completion': 0.0,
            'pull_completion': 0.0,
        }

        # Write the initial status.
        self.__exit__(None, None, None)

    @coroutine
    def _append_log_message(self, log_message, log_type=None, log_data=None):
        """Append a timestamped entry to the build logs; RedisError is logged, not raised."""
        # Work on a copy so the timestamp is not written into the caller's dict.
        log_data = dict(log_data or {})
        log_data['datetime'] = str(datetime.datetime.now())

        try:
            yield From(
                self._build_logs.append_log_message(self._uuid, log_message,
                                                    log_type, log_data))
        except RedisError:
            logger.exception('Could not save build log for build %s: %s',
                             self._uuid, log_message)

    @coroutine
    def append_log(self, log_message, extra_data=None):
        """Append a plain log message; a None message is ignored."""
        if log_message is None:
            return

        yield From(self._append_log_message(log_message, log_data=extra_data))

    @coroutine
    def set_command(self, command, extra_data=None):
        """Record a new current command; repeating the current command is a no-op."""
        if self._current_command == command:
            raise Return()

        self._current_command = command
        yield From(
            self._append_log_message(command, self._build_logs.COMMAND,
                                     extra_data))

    @coroutine
    def set_error(self,
                  error_message,
                  extra_data=None,
                  internal_error=False,
                  requeued=False):
        """Move the build to an error phase and log the error message.

        The phase is INTERNAL_ERROR only when the error is internal and the
        build was requeued; otherwise it is ERROR.
        """
        error_phase = BUILD_PHASE.INTERNAL_ERROR if internal_error and requeued else BUILD_PHASE.ERROR
        yield From(self.set_phase(error_phase))

        # Copy before adding the flag so the caller's dict is left untouched.
        extra_data = dict(extra_data or {})
        extra_data['internal_error'] = internal_error
        yield From(
            self._append_log_message(error_message, self._build_logs.ERROR,
                                     extra_data))

    @coroutine
    def set_phase(self, phase, extra_data=None):
        """Log a phase change and update the repository build.

        Returns (via Return) False when `phase` is already the current phase.
        """
        if phase == self._current_phase:
            raise Return(False)

        self._current_phase = phase
        yield From(
            self._append_log_message(phase, self._build_logs.PHASE,
                                     extra_data))

        # Update the repository build with the new phase
        # NOTE(review): the AsyncWrapper call result is returned without
        # `yield From`, so callers presumably receive an awaitable rather than
        # the update's own result - confirm this is intended.
        raise Return(
            self._build_model.update_phase_then_close(self._uuid, phase))

    def __enter__(self):
        return self._status

    def __exit__(self, exc_type, value, traceback):
        # Status is written whether or not the block raised; a RedisError
        # during the write is logged and not propagated.
        try:
            self._sync_build_logs.set_status(self._uuid, self._status)
        except RedisError:
            logger.exception('Could not set status of build %s to %s',
                             self._uuid, self._status)