def set_key(self, key, value, overwrite=False, expiration=None):
    assert not self.is_canceller_only
    try:
        already_exists = self._client.exists(key)
        if already_exists and not overwrite:
            raise KeyError(key)

        self._client.set(key, value, xx=overwrite)
        if expiration is not None:
            overwrite_expiring_key = self._client.exists(slash_join(key, REDIS_EXPIRING_SUFFIX))
            self._client.set(
                slash_join(key, REDIS_EXPIRING_SUFFIX),
                value,
                xx=overwrite_expiring_key,
                ex=expiration,
            )
            # Remove any expired key that might have previously been created but not removed
            # if a new expiration is set.
            self._client.delete(slash_join(key, REDIS_EXPIRED_SUFFIX))

        key_event = KeyEvent.SET if already_exists else KeyEvent.CREATE
        self._publish(event=key_event, key=key, value=value)
    except redis.ConnectionError as rce:
        raise OrchestratorConnectionError(rce)
    except redis.RedisError as re:
        raise OrchestratorError(re)
async def _cancel_callback(self, key_change):
    if key_change.event not in (KeyEvent.CREATE, KeyEvent.SET):
        return

    build_uuid = key_change.value
    build_info = self._build_uuid_to_info.get(build_uuid, None)
    if build_info is None:
        logger.debug('No build info for "%s" job %s', key_change.event, build_uuid)
        return False

    lock_key = slash_join(self._canceled_lock_prefix, build_uuid, build_info.execution_id)
    lock_acquired = await self._orchestrator.lock(lock_key)
    if lock_acquired:
        builder_realm = build_info.component.builder_realm
        await self.kill_builder_executor(build_uuid)
        await self._orchestrator.delete_key(self._realm_key(builder_realm))
        await self._orchestrator.delete_key(self._metric_key(builder_realm))
        await self._orchestrator.delete_key(slash_join(self._job_prefix, build_uuid))

    # This is outside the lock so we can un-register the component wherever it is registered to.
    await build_info.component.cancel_build()
def set_key(self, key, value, overwrite=False, expiration=None):
    try:
        already_exists = self._client.exists(key)
        if already_exists and not overwrite:
            raise KeyError(key)

        # Set an expiration in case the handler was not able to delete the original key.
        # The extra leeway is so the expire event handler has time to get the original value
        # and publish the event.
        self._client.set(key, value, xx=overwrite)
        if expiration is not None:
            self._client.expire(key, expiration + ONE_DAY)
            overwrite_expiring_key = self._client.exists(slash_join(key, REDIS_EXPIRING_SUFFIX))
            # The "expiring/*" keys are only used to publish the EXPIRE event. A separate key
            # is needed because the EXPIRE event does not include the original key value.
            self._client.set(
                slash_join(key, REDIS_EXPIRING_SUFFIX),
                "",
                xx=overwrite_expiring_key,
                ex=expiration,
            )
            # Remove any expired key that might have previously been created but not removed
            # if a new expiration is set.
            self._client.delete(slash_join(key, REDIS_EXPIRED_SUFFIX))

        key_event = KeyEvent.SET if already_exists else KeyEvent.CREATE
        self._publish(event=key_event, key=key, value=value)
    except redis.ConnectionError as rce:
        raise OrchestratorConnectionError(rce)
    except redis.RedisError as re:
        raise OrchestratorError(re)
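# A hedged sketch (not part of the original module) of the key layout the set_key variants
# above maintain for a key "building/abc" with a 60s expiration. The suffix constants are
# assumed to be plain strings (e.g. "expiring", "expired") joined via slash_join:
#
#   building/abc            -> value, TTL = 60s + ONE_DAY (leeway for the expire handler)
#   building/abc/expiring   -> "",    TTL = 60s (its expiry fires the keyspace EXPIRE event)
#   building/abc/expired    -> written later by the expire handler; marks cleanup as pending
#
# Illustrative usage, assuming `orchestrator` is an instance of the enclosing class:
#
#   orchestrator.set_key("building/abc", "job-payload", expiration=60)
#   orchestrator.get_key("building/abc")  # -> "job-payload"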
def test_delete_key(orchestrator):
    key_prefix = "building/"

    with pytest.raises(KeyError):
        orchestrator.delete_key(slash_join(key_prefix, "key1"))

    orchestrator.set_key(slash_join(key_prefix, "key1"), "test_val")
    assert orchestrator.get_key(slash_join(key_prefix, "key1")) is not None

    orchestrator.delete_key(slash_join(key_prefix, "key1"))
    with pytest.raises(KeyError):
        orchestrator.get_key(slash_join(key_prefix, "key1"))
def test_get_key(orchestrator):
    key_prefix = "building/"

    with pytest.raises(KeyError):
        orchestrator.get_key(slash_join(key_prefix, "key1"))

    # overwrite=True maps to redis' XX flag, so setting a non-existent key is a no-op.
    orchestrator.set_key(slash_join(key_prefix, "key1"), "test_val", overwrite=True)
    with pytest.raises(KeyError):
        orchestrator.get_key(slash_join(key_prefix, "key1"))

    orchestrator.set_key(slash_join(key_prefix, "key1"), "test_val")
    assert orchestrator.get_key(slash_join(key_prefix, "key1")) == "test_val"
def delete_key(self, key):
    assert not self.is_canceller_only
    try:
        value = self._client.get(key)
        if value is None:
            raise KeyError(key)

        self._client.delete(key)
        self._client.delete(slash_join(key, REDIS_EXPIRING_SUFFIX))
        self._client.delete(slash_join(key, REDIS_EXPIRED_SUFFIX))
        self._publish(event=KeyEvent.DELETE, key=key, value=value.decode("utf-8"))
    except redis.ConnectionError as rce:
        raise OrchestratorConnectionError(rce)
    except redis.RedisError as re:
        raise OrchestratorError(re)
def test_set_key(orchestrator):
    some_key = "someprefix/somekey"

    # Setting overwrite when the key doesn't exist prevents it from being written.
    orchestrator.set_key(some_key, "test_val", overwrite=True)
    with pytest.raises(KeyError):
        orchestrator.get_key(some_key)

    # Set some key/value.
    orchestrator.set_key(some_key, "test_val_2")
    assert orchestrator.get_key(some_key) == "test_val_2"

    # Try overwriting some existing key without setting overwrite.
    with pytest.raises(KeyError):
        orchestrator.set_key(some_key, "test_val_3")

    # Try overwriting some existing key with overwrite set.
    # Also expects a new expiration key to be created.
    orchestrator.set_key(some_key, "test_val_4", overwrite=True, expiration=360)
    assert orchestrator.get_key(some_key) == "test_val_4"
    assert orchestrator.get_key(slash_join(some_key, REDIS_EXPIRING_SUFFIX)) is not None
def validate_client_id_and_secret(self, http_client, url_scheme_and_hostname):
    # First: Verify that the github endpoint is actually Github by checking for the
    # X-GitHub-Request-Id header.
    api_endpoint = self._api_endpoint()
    result = http_client.get(
        api_endpoint, auth=(self.client_id(), self.client_secret()), timeout=5
    )
    if 'X-GitHub-Request-Id' not in result.headers:
        raise Exception('Endpoint is not a Github (Enterprise) installation')

    # Next: Verify the client ID and secret.
    # Note: The following code is a hack until such time as Github officially adds an API endpoint
    # for verifying a {client_id, client_secret} pair. This workaround was given to us
    # *by a Github Engineer* (Jan 8, 2015).
    #
    # TODO: Replace with the real API call once added.
    #
    # Hitting the endpoint applications/{client_id}/tokens/foo will result in the following
    # behavior IF the client_id is given as the HTTP username and the client_secret as the HTTP
    # password:
    #   - If the {client_id, client_secret} pair is invalid in some way, we get a 401 error.
    #   - If the pair is valid, then we get a 404 because the 'foo' token does not exist.
    validate_endpoint = slash_join(api_endpoint, 'applications/%s/tokens/foo' % self.client_id())
    result = http_client.get(
        validate_endpoint, auth=(self.client_id(), self.client_secret()), timeout=5
    )
    return result.status_code == 404
def setUp(self):
    super(TestEphemeralLifecycle, self).setUp()

    EphemeralBuilderManager.EXECUTORS["test"] = self._create_mock_executor

    self.register_component_callback = Mock()
    self.unregister_component_callback = Mock()
    self.job_heartbeat_callback = Mock()
    self.job_complete_callback = AsyncWrapper(Mock())

    self.manager = EphemeralBuilderManager(
        self.register_component_callback,
        self.unregister_component_callback,
        self.job_heartbeat_callback,
        self.job_complete_callback,
        "127.0.0.1",
        30,
    )

    self.manager.initialize({
        "EXECUTOR": "test",
        "ORCHESTRATOR": {"MEM_CONFIG": None},
    })

    # Ensure that the realm and building callbacks have been registered.
    callback_keys = [key for key in self.manager._orchestrator.callbacks]
    self.assertIn(REALM_PREFIX, callback_keys)
    self.assertIn(JOB_PREFIX, callback_keys)

    self.mock_job = self._create_build_job()
    self.mock_job_key = slash_join("building", BUILD_UUID)
def _mark_job_incomplete(self, build_job, build_info):
    """
    Marks a job as incomplete, in response to a failure to start or a timeout.
    """
    executor_name = build_info.executor_name
    execution_id = build_info.execution_id

    logger.warning(
        'Build executor failed to successfully boot with execution id %s', execution_id
    )

    # Take a lock to ensure that only one manager reports the build as incomplete for this
    # execution.
    lock_key = slash_join(self._expired_lock_prefix, build_job.build_uuid, execution_id)
    acquired_lock = yield From(self._orchestrator.lock(lock_key))
    if acquired_lock:
        try:
            # Clean up the bookkeeping for the job.
            yield From(self._orchestrator.delete_key(self._job_key(build_job)))
        except KeyError:
            logger.debug(
                'Could not delete job key %s; might have been removed already',
                build_job.build_uuid,
            )

        logger.error(
            '[BUILD INTERNAL ERROR] Build ID: %s. Exec name: %s. Exec ID: %s',
            build_job.build_uuid,
            executor_name,
            execution_id,
        )
        yield From(
            self.job_complete_callback(
                build_job, BuildJobResult.INCOMPLETE, executor_name, update_phase=True
            )
        )
    else:
        logger.debug('Did not get lock for job-expiration for job %s', build_job.build_uuid)
def _expiring_key_handler(self, message):
    # Initialize these up front so the except/post-processing blocks below cannot hit a
    # NameError if decoding the message fails before they are assigned.
    message_tup = None
    key = None
    expired_value = None

    try:
        message_tup = (
            message.get("type"),
            message.get("pattern").decode("utf-8"),
            message.get("channel").decode("utf-8"),
            message.get("data").decode("utf-8"),
        )
        if self._is_expired_keyspace_event(message_tup):
            # Get the value of the original key before the expiration happened.
            key = self._key_from_expiration(message_tup)
            expired_value = self._client.get(key)

            # Mark the key as expired. This key is used to track post-job cleanup in the
            # callback, to allow another manager to pick up the cleanup if this one fails.
            self._client.set(slash_join(key, REDIS_EXPIRED_SUFFIX), expired_value)
            self._client.delete(key)
    except redis.ConnectionError:
        _sleep_orchestrator()
    except redis.RedisError as re:
        logger.exception("Redis exception watching redis expirations: %s - %s", key, re)
    except Exception as e:
        logger.exception("Unknown exception watching redis expirations: %s - %s", key, e)

    if message_tup is not None and self._is_expired_keyspace_event(message_tup) and expired_value is not None:
        for watched_key, callback in self._watched_keys.items():
            if key.startswith(watched_key):
                callback(KeyChange(KeyEvent.EXPIRE, key, expired_value))
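# A hedged illustration (assumed shapes, not from the original module) of the pub/sub message
# the handler above receives once Redis keyspace notifications are enabled. With db=0 and a
# psubscribe pattern like "__keyspace@0__:*", the expiry of "building/abc/expiring" arrives
# roughly as:
#
#   {
#       "type": "pmessage",
#       "pattern": b"__keyspace@0__:*",
#       "channel": b"__keyspace@0__:building/abc/expiring",
#       "data": b"expired",
#   }
#
# _is_expired_keyspace_event presumably checks the message type and the "expired" payload, and
# _key_from_expiration strips the channel prefix (and the expiring suffix) to recover the
# original key, here "building/abc".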
def delete_key(self, key):
    assert not self.is_canceller_only
    value = yield From(self._client.get(key))
    yield From(self._client.delete(key))
    yield From(self._client.delete(slash_join(key, REDIS_EXPIRING_SUFFIX)))
    yield From(self._publish(event=KeyEvent.DELETE, key=key, value=value))
def __init__( self, host="127.0.0.1", port=6379, password=None, db=0, cert_and_key=None, ca_cert=None, ssl=False, skip_keyspace_event_setup=False, canceller_only=False, **kwargs, ): self.is_canceller_only = canceller_only (cert, key) = tuple(cert_and_key) if cert_and_key is not None else (None, None) self._client = redis.StrictRedis( host=host, port=port, password=password, db=db, ssl_certfile=cert, ssl_keyfile=key, ssl_ca_certs=ca_cert, ssl=ssl, socket_connect_timeout=1, socket_timeout=2, health_check_interval=2, ) self._shutting_down = False self._watched_keys = {} self._pubsub_key = slash_join( kwargs.get("orchestrator_prefix", ""), REDIS_DEFAULT_PUBSUB_KEY ).lstrip("/") if not self.is_canceller_only: # sleep_time is not really calling time.sleep(). It is the socket's timeout value. # run_in_thread uses an event loop that uses a non-blocking `parse_response` of the PubSub object. # This means the event loop will return immedietely even if there are no new messages. # Setting a value other than the default 0 prevents that thread from exhausting CPU time. # https://github.com/andymccurdy/redis-py/issues/821 # Configure a subscription to watch events that the orchestrator manually publishes. logger.debug("creating pubsub with key %s", self._pubsub_key) self._pubsub = self._client.pubsub() self._pubsub.subscribe(**{self._pubsub_key: self._published_key_handler}) self._pubsub_thread = self._pubsub.run_in_thread(daemon=True, sleep_time=5) # Configure a subscription to watch expired keyspace events. if not skip_keyspace_event_setup: self._client.config_set( REDIS_KEYSPACE_EVENT_CONFIG_KEY, REDIS_KEYSPACE_EXPIRED_EVENT_CONFIG_VALUE ) self._pubsub_expiring = self._client.pubsub() self._pubsub_expiring.psubscribe( **{REDIS_EXPIRED_KEYSPACE_PATTERN % (db, "*"): self._expiring_key_handler} ) self._pubsub_expiring_thread = self._pubsub_expiring.run_in_thread(daemon=True, sleep_time=5)
def __init__( self, host="127.0.0.1", port=6379, password=None, db=0, cert_and_key=None, ca_cert=None, client_threads=5, ssl=False, skip_keyspace_event_setup=False, canceller_only=False, **kwargs, ): self.is_canceller_only = canceller_only (cert, key) = tuple(cert_and_key) if cert_and_key is not None else (None, None) self._sync_client = redis.StrictRedis( host=host, port=port, password=password, db=db, ssl_certfile=cert, ssl_keyfile=key, ssl_ca_certs=ca_cert, ssl=ssl, ) self._shutting_down = False self._tasks = {} self._watched_keys = {} self._pubsub_key = slash_join(kwargs.get("orchestrator_prefix", ""), REDIS_DEFAULT_PUBSUB_KEY).lstrip("/") if not self.is_canceller_only: (self._client, self._async_executor) = wrap_with_threadpool( self._sync_client, client_threads) # Configure a subscription to watch events that the orchestrator manually publishes. logger.debug("creating pubsub with key %s", self._pubsub_key) published_pubsub = self._sync_client.pubsub() published_pubsub.subscribe(self._pubsub_key) (self._pubsub, self._async_executor_pub) = wrap_with_threadpool(published_pubsub) self._watch_published_key() # Configure a subscription to watch expired keyspace events. if not skip_keyspace_event_setup: self._sync_client.config_set( REDIS_KEYSPACE_EVENT_CONFIG_KEY, REDIS_KEYSPACE_EVENT_CONFIG_VALUE) expiring_pubsub = self._sync_client.pubsub() expiring_pubsub.psubscribe(REDIS_EXPIRED_KEYSPACE_PATTERN % (db, "*")) (self._pubsub_expiring, self._async_executor_ex) = wrap_with_threadpool(expiring_pubsub) self._watch_expiring_key()
def try_cancel_build(self, build_uuid):
    logger.info("Cancelling build %s", build_uuid)
    cancel_key = slash_join(CANCEL_PREFIX, build_uuid)
    try:
        self._orchestrator.set_key_sync(cancel_key, build_uuid, expiration=60)
        return True
    except OrchestratorError:
        logger.exception("Failed to write cancel action to redis with uuid %s", build_uuid)
        return False
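# Hedged end-to-end note connecting this to _cancel_callback earlier in this section: the
# 60s cancel key written above carries the build UUID as its value, and the CREATE/SET event
# it publishes is what the callback reacts to before taking a per-execution lock and tearing
# down the realm/metric/job keys. Illustrative flow (UUID is a placeholder):
#
#   manager.try_cancel_build("deadbeef-uuid")  # writes cancel/deadbeef-uuid with a 60s TTL
#   # -> KeyEvent.CREATE published
#   # -> _cancel_callback(KeyChange(KeyEvent.CREATE, "cancel/deadbeef-uuid", "deadbeef-uuid"))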
async def _setup_job_for_managers(self):
    test_component = Mock(spec=BuildComponent)
    test_component.builder_realm = REALM_ID
    test_component.start_build = Mock(side_effect=self._create_completed_future())
    self.register_component_callback.return_value = test_component

    is_scheduled = await self.manager.schedule(self.mock_job)
    self.assertTrue(is_scheduled)
    self.assertEqual(self.test_executor.start_builder.call_count, 1)

    # Ensure that the job, realm, and metric callbacks have been registered.
    callback_keys = [key for key in self.manager._orchestrator.callbacks]
    self.assertIn(self.mock_job_key, self.manager._orchestrator.state)
    self.assertIn(REALM_PREFIX, callback_keys)
    # TODO: assert metric key has been set

    realm_for_build = self._find_realm_key(self.manager._orchestrator, BUILD_UUID)

    raw_realm_data = await self.manager._orchestrator.get_key(
        slash_join("realm", realm_for_build)
    )
    realm_data = json.loads(raw_realm_data)
    realm_data["realm"] = REALM_ID

    # Right now the job is not registered with any managers because etcd has not accepted the job.
    self.assertEqual(self.register_component_callback.call_count, 0)

    # Fire off a realm changed with the same data.
    await self.manager._realm_callback(
        KeyChange(KeyEvent.CREATE, slash_join(REALM_PREFIX, REALM_ID), json.dumps(realm_data))
    )

    # Ensure that we have at least one component node.
    self.assertEqual(self.register_component_callback.call_count, 1)
    self.assertEqual(1, self.manager.num_workers())

    # Ensure that the build info exists.
    self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

    return test_component
def _metric_key(self, realm):
    """
    Create a key which is used to track a job's metrics in the Orchestrator.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track the job's metrics
    :rtype: str
    """
    return slash_join(self._metric_prefix, realm)
def _job_key(self, build_job):
    """
    Creates a key which is used to track a job in the Orchestrator.

    :param build_job: unique job identifier for a build
    :type build_job: str
    :returns: key used to track the job
    :rtype: str
    """
    return slash_join(self._job_prefix, build_job.job_details["build_uuid"])
def validate_organization(self, organization_id, http_client):
    org_endpoint = slash_join(self._api_endpoint(), 'orgs/%s' % organization_id.lower())

    result = http_client.get(
        org_endpoint,
        headers={'Accept': 'application/vnd.github.moondragon+json'},
        timeout=5,
    )

    return result.status_code == 200
def _realm_key(self, realm):
    """
    Create a key which is used to track an incoming connection on a realm.

    :param realm: realm for the build
    :type realm: str
    :returns: key used to track the connection to the realm
    :rtype: str
    """
    return slash_join(self._realm_prefix, realm)
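# Illustrative key shapes produced by the *_key helpers above, assuming the prefixes used
# elsewhere in this section ("building" for jobs, "realm" for realms) and a hypothetical
# "metric" prefix:
#
#   _job_key(build_job)    -> "building/<build_uuid>"
#   _realm_key("realm-1")  -> "realm/realm-1"
#   _metric_key("realm-1") -> "metric/realm-1"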
def extract_dockerfile_args(self):
    dockerfile_path = self.build_config.get("build_subdir", "")
    context = self.build_config.get("context", "")
    if not (dockerfile_path == "" or context == ""):
        # This should not happen and can be removed when we centralize validating build_config.
        dockerfile_abspath = slash_join("", dockerfile_path)
        if ".." in os.path.relpath(dockerfile_abspath, context):
            return os.path.split(dockerfile_path)
        dockerfile_path = os.path.relpath(dockerfile_abspath, context)

    return context, dockerfile_path
def extract_dockerfile_args(build_config):
    dockerfile_path = build_config.get('build_subdir', '')
    context = build_config.get('context', '')
    if not (dockerfile_path == '' or context == ''):
        # This should not happen and can be removed when we centralize validating build_config.
        dockerfile_abspath = slash_join('', dockerfile_path)
        if ".." in os.path.relpath(dockerfile_abspath, context):
            return os.path.split(dockerfile_path)
        dockerfile_path = os.path.relpath(dockerfile_abspath, context)

    return context, dockerfile_path
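# Worked examples of the normalization above (paths illustrative; assumes slash_join('', p)
# yields "/p" and standard os.path.relpath semantics):
#
#   extract_dockerfile_args({"build_subdir": "app/Dockerfile", "context": "/app"})
#   # dockerfile_abspath = "/app/Dockerfile"; relpath("/app/Dockerfile", "/app") = "Dockerfile"
#   # -> ("/app", "Dockerfile")
#
#   extract_dockerfile_args({"build_subdir": "app/Dockerfile", "context": "/other"})
#   # relpath("/app/Dockerfile", "/other") contains "..", so fall back to
#   # os.path.split("app/Dockerfile")
#   # -> ("app", "Dockerfile")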
def test_get_prefixed_keys(orchestrator):
    keys_to_generate = 10
    key_prefix = "building/"
    generated_keys = set()

    for x in range(keys_to_generate):
        orchestrator.set_key(slash_join(key_prefix, str(x)), "test_val")
        generated_keys.add(slash_join(key_prefix, str(x)))
    assert len(orchestrator.get_prefixed_keys(key_prefix)) == keys_to_generate

    keys_to_remove = randrange(1, keys_to_generate)
    for x in range(keys_to_remove):
        orchestrator.delete_key(slash_join(key_prefix, str(x)))
        generated_keys.remove(slash_join(key_prefix, str(x)))
    assert len(orchestrator.get_prefixed_keys(key_prefix)) == keys_to_generate - keys_to_remove

    for k in generated_keys:
        orchestrator.delete_key(k)
    assert len(orchestrator.get_prefixed_keys(key_prefix)) == 0
async def set_key(self, key, value, overwrite=False, expiration=None):
    assert not self.is_canceller_only
    already_exists = await self._client.exists(key)

    await self._client.set(key, value, xx=overwrite)
    if expiration is not None:
        await self._client.set(
            slash_join(key, REDIS_EXPIRING_SUFFIX), value, xx=overwrite, ex=expiration
        )

    key_event = KeyEvent.SET if already_exists else KeyEvent.CREATE
    await self._publish(event=key_event, key=key, value=value)
def get_key(self, key):
    assert not self.is_canceller_only
    try:
        value = self._client.get(key)
        if value is None:
            # If the key expired, the expired-marker key should have been removed by the
            # handler but may still exist. Delete it if that's the case.
            if self._key_is_expired(key):
                self._client.delete(slash_join(key, REDIS_EXPIRED_SUFFIX))
            raise KeyError(key)
    except redis.ConnectionError as rce:
        raise OrchestratorConnectionError(rce)
    except redis.RedisError as re:
        raise OrchestratorError(re)

    return value.decode("utf-8")
async def test_another_manager_takes_job(self):
    # Prepare a job to be taken by another manager.
    test_component = await self._setup_job_for_managers()

    await self.manager._realm_callback(
        KeyChange(
            KeyEvent.DELETE,
            slash_join(REALM_PREFIX, REALM_ID),
            json.dumps(
                {
                    "realm": REALM_ID,
                    "token": "beef",
                    "execution_id": "123",
                    "job_queue_item": self.mock_job.job_item,
                }
            ),
        )
    )

    self.unregister_component_callback.assert_called_once_with(test_component)

    # Ensure that the executor does not kill the job.
    self.assertEqual(self.test_executor.stop_builder.call_count, 0)

    # Ensure that we still have the build info, but not the component.
    self.assertEqual(0, self.manager.num_workers())
    self.assertIsNotNone(self.manager._build_uuid_to_info.get(BUILD_UUID))

    # Delete the job once it has "completed".
    await self.manager._job_callback(
        KeyChange(
            KeyEvent.DELETE,
            self.mock_job_key,
            json.dumps({"had_heartbeat": False, "job_queue_item": self.mock_job.job_item}),
        )
    )

    # Ensure the job was removed from the info, but stop was not called.
    self.assertIsNone(self.manager._build_uuid_to_info.get(BUILD_UUID))
    self.assertEqual(self.test_executor.stop_builder.call_count, 0)
def set_key(self, key, value, overwrite=False, expiration=None):
    preexisting_key = key in self.state
    if preexisting_key and not overwrite:
        raise KeyError(key)

    # Simulate redis' behavior when using xx and the key does not exist.
    if not preexisting_key and overwrite:
        return

    absolute_expiration = None
    if expiration is not None:
        absolute_expiration = datetime.datetime.now() + datetime.timedelta(seconds=expiration)

    self.state.set(key, value, expires=absolute_expiration)
    self.state.set(slash_join(key, REDIS_EXPIRING_SUFFIX), value, expires=absolute_expiration)

    event = KeyEvent.CREATE if not preexisting_key else KeyEvent.SET
    for callback in self._callbacks_prefixed(key):
        callback(KeyChange(event, key, value))
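# Behavior sketch for the in-memory implementation above, mirroring redis' XX flag; `orch`
# is an illustrative instance of the enclosing in-memory orchestrator class:
#
#   orch.set_key("k", "v1", overwrite=True)  # key absent + overwrite -> silently a no-op
#   orch.set_key("k", "v1")                  # CREATE KeyChange delivered to prefix callbacks
#   orch.set_key("k", "v2")                  # raises KeyError (exists, overwrite=False)
#   orch.set_key("k", "v2", overwrite=True)  # SET KeyChange delivered to prefix callbacks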
def set_key_sync(self, key, value, overwrite=False, expiration=None):
    already_exists = self._sync_client.exists(key)

    self._sync_client.set(key, value, xx=overwrite)
    if expiration is not None:
        self._sync_client.set(
            slash_join(key, REDIS_EXPIRING_SUFFIX), value, xx=overwrite, ex=expiration
        )

    self._sync_client.publish(
        self._pubsub_key,
        json.dumps(
            {
                "event": int(KeyEvent.SET if already_exists else KeyEvent.CREATE),
                "key": key,
                "value": value,
            }
        ),
    )
def test_on_key_change(orchestrator):
    key_prefix = "building/"
    mock_callback = Mock()
    orchestrator.on_key_change(key_prefix, lambda x: mock_callback.meth(x))

    # CREATE
    orchestrator.set_key(slash_join(key_prefix, "key1"), "test_val")
    time.sleep(0.1)
    mock_callback.meth.assert_called_with(
        KeyChange(KeyEvent.CREATE, slash_join(key_prefix, "key1"), "test_val")
    )

    # SET
    orchestrator.set_key(slash_join(key_prefix, "key1"), "test_val", overwrite=True)
    time.sleep(0.1)
    mock_callback.meth.assert_called_with(
        KeyChange(KeyEvent.SET, slash_join(key_prefix, "key1"), "test_val")
    )

    # DELETE
    orchestrator.delete_key(slash_join(key_prefix, "key1"))
    time.sleep(0.1)
    mock_callback.meth.assert_called_with(
        KeyChange(KeyEvent.DELETE, slash_join(key_prefix, "key1"), "test_val")
    )
def _job_key(self, build_id):
    """Creates a key which is used to track a job in the Orchestrator."""
    return slash_join(self._job_prefix, build_id)