def new_client(self, cmd, anon_ch):
    """Process a `new` client command and reply on the anonymous channel.

    The command is validated with ``check_new_params``; on success a client is created with
    ``generate_client``, recorded in ``clients_store`` and its credentials are sent back with
    ``send_client``. Any failure is reported on ``anon_ch`` as
    ``{'status': 'error', 'message': ...}``.

    :param cmd: command dict; ``tenant``, ``actor_id`` and ``worker_id`` are read here.
    :param anon_ch: reply channel object; only its ``put`` method is used by this method.
    :return: None
    """
    valid, msg, owner = self.check_new_params(cmd)
    if not valid:
        m = 'Invalid command parameters: {}'.format(msg)
        logger.error(m)
        anon_ch.put({'status': 'error', 'message': m})
        return None

    try:
        api_server, key, secret, access_token, refresh_token = self.generate_client(cmd, owner)
    except ClientException as e:
        logger.error("Error generating client: {}".format(e))
        # Reply on the channel object we were handed, like every other branch does.
        # Previously the object was passed as the *name* of a brand new ClientsChannel,
        # which is not the channel the other replies are written to.
        anon_ch.put({'status': 'error', 'message': e.msg})
        return None
    logger.debug("Client generated.")

    cl = Client(**{
        'tenant': cmd['tenant'],
        'actor_id': cmd['actor_id'],
        'worker_id': cmd['worker_id'],
        'client_key': key,
        'client_name': cmd['worker_id'],
    })
    clients_store[cl.id] = cl
    logger.info("client generated and stored. client: {}".format(cl))
    self.send_client(api_server, key, secret, access_token, refresh_token, anon_ch)
def __init__(self):
    """Initialise the shared secret, the clients channel and the per-tenant credentials.

    For every tenant returned by ``get_tenants()`` the username and password are read from
    the ``_abaco_<tenant>_username`` / ``_abaco_<tenant>_password`` environment variables,
    falling back to the empty string when a variable is not set.
    """
    env = os.environ
    self.secret = env.get('_abaco_secret')
    self.ch = ClientsChannel()
    self.credentials = {}
    for tenant in get_tenants():
        user_var = '_abaco_{}_username'.format(tenant)
        pass_var = '_abaco_{}_password'.format(tenant)
        entry = {'username': env.get(user_var, ''),
                 'password': env.get(pass_var, '')}
        self.credentials[tenant] = entry
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.

    Dict messages whose 'value' is 'status' are health checks and get 'ok' put on their
    'reply_to' channel. The message 'stop' shuts the worker down: the client (when one was
    passed) is deleted via the clients channel, the worker record is deleted, the actor
    channel is closed and sys.exit() is called. Any other message is ignored.

    :param tenant: tenant forwarded with the client delete request.
    :param worker_ch: channel polled (2 second timeout) for messages.
    :param actor_id: id of the actor, used for the delete requests and logging.
    :param worker_id: id of this worker, used for the delete requests.
    :param actor_ch: actor channel; closed when the worker stops.
    :param ag_client: object exposing `api_key`, or a falsy value when no client was passed.
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            # nothing arrived within the timeout; poll again.
            continue
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))

        if type(msg) == dict:
            if msg.get('value', '') == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                reply_ch = msg['reply_to']
                reply_ch.put('ok')
            continue

        if not msg == 'stop':
            continue

        logger.info("Received stop message, stopping worker...")
        # first, delete an associated client
        # its possible this worker was not passed a client,
        # but if so, we need to delete it before shutting down.
        if not ag_client:
            logger.info("Did not receive client. Not issuing delete. Exiting.")
        else:
            logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
            secret = os.environ.get('_abaco_secret')
            clients_ch = ClientsChannel()
            reply = clients_ch.request_delete_client(tenant=tenant,
                                                     actor_id=actor_id,
                                                     worker_id=worker_id,
                                                     client_id=ag_client.api_key,
                                                     secret=secret)
            if reply['status'] == 'ok':
                logger.info("Delete request completed successfully.")
            else:
                logger.error("Error deleting client. Message: {}".format(reply['message']))

        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as exc:
            logger.info("Got WorkerException from delete_worker(). Exception: {}".format(exc))

        keep_running = False
        actor_ch.close()
        logger.info("Closing actor channel for actor: {}".format(actor_id))
        logger.info("Worker is now exiting.")
        sys.exit()
def clean_up_clients_store():
    """Request deletion of every stored client whose worker no longer exists.

    Each client in ``clients_store`` must carry ``worker_id``, ``tenant``, ``actor_id`` and
    ``client_key``; records missing any of them are logged and skipped. For the remaining
    records, ``get_worker(worker_id)`` decides whether the worker is still there; if it is
    not, a delete request is sent over a ``ClientsChannel`` and the outcome is logged.

    :return: None. Returns early (after logging) when ``_abaco_secret`` is not set.
    """
    logger.debug("top of clean_up_clients_store")
    secret = os.environ.get('_abaco_secret')
    if not secret:
        logger.error("health.py not configured with _abaco_secret. exiting clean_up_clients_store.")
        return None

    # checked in this order so the first missing field is the one reported.
    required_fields = ('worker_id', 'tenant', 'actor_id', 'client_key')
    for client in clients_store.values():
        missing = next((field for field in required_fields if not client.get(field)), None)
        if missing:
            logger.error("client object in clients_store without {}. client: {}".format(missing, client))
            continue
        wid = client.get('worker_id')
        tenant = client.get('tenant')
        actor_id = client.get('actor_id')
        client_key = client.get('client_key')

        # check to see if the wid is the id of an actual worker:
        worker = get_worker(wid)
        if worker:
            logger.info("worker {} still here. ignoring client {}.".format(wid, client))
            continue

        logger.info("worker {} is gone. deleting client {}.".format(wid, client))
        clients_ch = ClientsChannel()
        try:
            msg = clients_ch.request_delete_client(tenant=tenant,
                                                   actor_id=actor_id,
                                                   worker_id=wid,
                                                   client_id=client_key,
                                                   secret=secret)
        finally:
            # one channel is opened per client; close it so they do not pile up.
            clients_ch.close()
        if msg['status'] == 'ok':
            logger.info("Client delete request completed successfully for "
                        "worker_id: {}, client_id: {}.".format(wid, client_key))
        else:
            # arguments are in placeholder order: worker id, client id, message.
            logger.error("Error deleting client for "
                         "worker_id: {}, client_id: {}. Message: {}".format(wid, client_key, msg['message']))
def delete_client(self, tenant, actor_id, worker_id, client_id, secret):
    """Ask the clients channel to delete a worker's client and log the outcome.

    :param tenant: tenant forwarded with the delete request.
    :param actor_id: actor id forwarded with the delete request.
    :param worker_id: worker id forwarded with the delete request.
    :param client_id: id of the client to delete.
    :param secret: secret forwarded with the delete request.
    :return: None
    """
    clients_ch = ClientsChannel()
    try:
        msg = clients_ch.request_delete_client(tenant=tenant,
                                               actor_id=actor_id,
                                               worker_id=worker_id,
                                               client_id=client_id,
                                               secret=secret)
    finally:
        # close the channel even when the request itself raises.
        clients_ch.close()
    if msg['status'] == 'ok':
        logger.info("Client delete request completed successfully for "
                    "worker_id: {}, client_id: {}.".format(worker_id, client_id))
    else:
        # arguments are in placeholder order: worker id, client id, message.
        logger.error("Error deleting client for "
                     "worker_id: {}, client_id: {}. Message: {}".format(worker_id, client_id, msg['message']))
def process_worker_ch(tenant, worker_ch, actor_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.

    Dict messages whose 'value' is 'status' are health checks and get 'ok' put on their
    'reply_to' channel. The message 'stop' shuts the worker down: the client (when one was
    passed) is deleted via the clients channel, the worker record is deleted, the actor
    channel is closed and sys.exit() is called. ``worker_ch.name`` is used as the worker id.

    :param tenant: tenant forwarded with the client delete request.
    :param worker_ch: channel polled (2 second timeout) for messages.
    :param actor_id: id of the actor this worker belongs to.
    :param actor_ch: actor channel; closed when the worker stops.
    :param ag_client: object exposing `api_key`, or a falsy value when no client was passed.
    :return:
    """
    global keep_running
    print("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message in worker channel: {}".format(msg))
        print("Type(msg)={}".format(type(msg)))
        if isinstance(msg, dict):
            if msg.get('value', '') == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                ch = msg['reply_to']
                ch.put('ok')
        elif msg == 'stop':
            print("Received stop message, stopping worker...")
            # first, delete an associated client
            # its possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                print("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                # keep the reply in its own name instead of overwriting the loop's `msg`.
                reply = clients_ch.request_delete_client(tenant=tenant,
                                                         actor_id=actor_id,
                                                         worker_id=worker_ch.name,
                                                         client_id=ag_client.api_key,
                                                         secret=secret)
                if reply['status'] == 'ok':
                    print("Delete request completed successfully.")
                else:
                    print("Error deleting client. Message: {}".format(reply['message']))
            else:
                print("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_ch.name)
            except WorkerException as e:
                # still best-effort (shutdown continues), but no longer silent.
                print("Got WorkerException from delete_worker(). Exception: {}".format(e))
            keep_running = False
            actor_ch.close()
            sys.exit()
def __init__(self):
    """Initialise the shared secret, the clients channel and the per-tenant credentials.

    Creating the ``ClientsChannel`` is retried while it raises ``RuntimeError``; the error
    is re-raised once more than 10 failures have been seen. Credentials are read from the
    ``_abaco_<tenant>_username`` / ``_abaco_<tenant>_password`` environment variables,
    defaulting to the empty string.
    """
    self.secret = os.environ.get('_abaco_secret')

    failures = 0
    while True:
        try:
            self.ch = ClientsChannel()
        except RuntimeError as e:
            failures += 1
            if failures > 10:
                raise e
        else:
            break

    self.credentials = {}
    for tenant in get_tenants():
        username = os.environ.get('_abaco_{}_username'.format(tenant), '')
        password = os.environ.get('_abaco_{}_password'.format(tenant), '')
        self.credentials[tenant] = {'username': username, 'password': password}
def new_client(self, cmd, anon_ch):
    """Process a `new` client command and reply on the anonymous channel.

    The command is validated with ``check_new_params``; on success a client is created with
    ``generate_client``, recorded in ``clients_store`` and its credentials are sent back with
    ``send_client``. Any failure is reported on ``anon_ch`` as
    ``{'status': 'error', 'message': ...}``.

    :param cmd: command dict; ``tenant``, ``actor_id`` and ``worker_id`` are read here.
    :param anon_ch: reply channel object; only its ``put`` method is used by this method.
    :return: None
    """
    valid, msg, owner = self.check_new_params(cmd)
    if not valid:
        anon_ch.put({'status': 'error',
                     'message': 'Invalid command parameters: {}'.format(msg)})
        return None

    try:
        api_server, key, secret, access_token, refresh_token = self.generate_client(cmd, owner)
    except ClientException as e:
        # Reply on the channel object we were handed, like every other branch does.
        # Previously the object was passed as the *name* of a brand new ClientsChannel,
        # which is not the channel the other replies are written to.
        anon_ch.put({'status': 'error', 'message': e.msg})
        return None

    cl = Client(**{'tenant': cmd['tenant'],
                   'actor_id': cmd['actor_id'],
                   'worker_id': cmd['worker_id'],
                   'client_key': key,
                   'client_name': cmd['worker_id'],
                   })
    clients_store[cl.id] = cl
    self.send_client(api_server, key, secret, access_token, refresh_token, anon_ch)
def client_generation(self, actor_id, worker_id, tenant):
    """Request an OAuth client for a worker over the clients channel.

    On any failure the actor is errored out via ``error_out_actor`` and the worker status is
    set to ``ERROR`` before the exception is raised.

    :param actor_id: id of the actor the worker belongs to.
    :param worker_id: id of the worker the client is generated for.
    :param tenant: tenant forwarded with the request.
    :return: tuple ``(client_id, access_token, refresh_token, api_server, client_secret)``
             taken from the reply message.
    :raises Exception: whatever ``request_client`` raised, re-raised unchanged.
    :raises SpawnerException: when the reply has status 'error'.
    """
    failure_notice = ("Abaco was unable to generate an OAuth client for a new "
                      "worker for this actor. System administrators have been notified.")
    client_ch = ClientsChannel()
    try:
        client_msg = client_ch.request_client(tenant=tenant,
                                              actor_id=actor_id,
                                              worker_id=worker_id,
                                              secret=self.secret)
    except Exception as e:
        logger.error("Got a ChannelTimeoutException trying to generate a client for "
                     "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
        # put worker in an error state and return
        self.error_out_actor(actor_id, worker_id, failure_notice)
        client_ch.close()
        Worker.update_worker_status(actor_id, worker_id, ERROR)
        logger.critical("Client generation FAILED.")
        # bare raise keeps the original traceback intact.
        raise
    client_ch.close()

    if client_msg.get('status') == 'error':
        logger.error("Error generating client: {}".format(client_msg.get('message')))
        self.error_out_actor(actor_id, worker_id, failure_notice)
        Worker.update_worker_status(actor_id, worker_id, ERROR)
        raise SpawnerException("Error generating client")  # TODO - clean up error message

    # client was generated successfully. Only the client id is logged: the access and
    # refresh tokens are credentials and must not end up in log files.
    logger.info("Got a client: {}".format(client_msg['client_id']))
    return client_msg['client_id'], \
        client_msg['access_token'], \
        client_msg['refresh_token'], \
        client_msg['api_server'], \
        client_msg['client_secret']
def process(self, cmd):
    """Main spawner method for processing a command from the CommandChannel.

    Starts the new workers, optionally stops the existing ones, records the new workers in
    the store, then tells every new worker (over its anonymous channel) to subscribe to the
    actor channel, passing along a freshly generated client when the ``workers`` /
    ``generate_clients`` config value is 'true'.

    :param cmd: command dict. ``actor_id``, ``worker_ids``, ``image`` and ``tenant`` are
                required; ``stop_existing`` defaults to True and ``num`` to ``self.num_workers``.
    :return: None. Returns early when ``start_workers`` raises ``SpawnerException`` or when
             a client request raises ``ChannelTimeoutException``.
    """
    logger.info("Spawner processing new command:{}".format(cmd))
    actor_id = cmd['actor_id']
    worker_ids = cmd['worker_ids']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = cmd.get('num', self.num_workers)
    # one placeholder per argument; `tenant` was previously missing from the template so
    # every value after `image` was printed under the wrong label.
    logger.info("command params: actor_id: {} worker_ids: {} image: {} tenant: {} "
                "stop_existing: {} num_workers: {}".format(actor_id, worker_ids, image, tenant,
                                                           stop_existing, num_workers))
    try:
        new_channels, anon_channels, new_workers = self.start_workers(actor_id,
                                                                      worker_ids,
                                                                      image,
                                                                      tenant,
                                                                      num_workers)
    except SpawnerException:
        # for now, start_workers will do clean up for a SpawnerException, so we just need
        # to return back to the run loop.
        logger.info("Spawner returning to main run loop.")
        return
    logger.info("Created new workers: {}".format(new_workers))

    # stop any existing workers:
    if stop_existing:
        logger.info("Stopping existing workers: {}".format(worker_ids))
        self.stop_workers(actor_id, worker_ids)

    # add workers to store first so that the records will be there when the workers go
    # to update their status
    if not stop_existing:
        # if we're not stopping the existing workers, we need to add each worker to the
        # actor's collection.
        for worker in new_workers.values():
            logger.info("calling add_worker for worker: {}.".format(worker))
            Worker.add_worker(actor_id, worker)
    else:
        # since we're stopping the existing workers, the actor's collection should just
        # be equal to the new_workers.
        workers_store[actor_id] = new_workers
        logger.info("workers_store set to new_workers: {}.".format(new_workers))

    # Tell new worker to subscribe to the actor channel.
    # If abaco is configured to generate clients for the workers, generate them now
    # and send new workers their clients.
    generate_clients = Config.get('workers', 'generate_clients').lower()
    logger.info("Sending messages to new workers over anonymous channels to subscribe to inbox.")
    # new_workers is a dictionary of dictionaries; list(d) creates a list of keys for a
    # dictionary d. hence, the idx^th entry of worker_keys is the key of the idx^th worker.
    worker_keys = list(new_workers)
    for idx, channel in enumerate(anon_channels):
        if generate_clients == 'true':
            worker_id = new_workers[worker_keys[idx]]['id']
            logger.info("Getting client for worker number {}, id: {}".format(idx, worker_id))
            client_ch = ClientsChannel()
            try:
                client_msg = client_ch.request_client(tenant=tenant,
                                                      actor_id=actor_id,
                                                      worker_id=worker_id,
                                                      secret=self.secret)
            except ChannelTimeoutException as e:
                logger.error("Got a ChannelTimeoutException trying to generate a client for "
                             "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
                # put actor in an error state and return
                self.error_out_actor(actor_id, worker_id,
                                     "Abaco was unable to generate an OAuth client for a new "
                                     "worker for this actor. System administrators have been notified.")
                client_ch.close()
                return
            client_ch.close()
            # we need to ignore errors when generating clients because it's possible it is not set up for a specific
            # tenant. we log it instead.
            if client_msg.get('status') == 'error':
                logger.error("Error generating client: {}".format(client_msg.get('message')))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'no'})
                logger.debug("Sent OK message over anonymous worker channel.")
            # else, client was generated successfully:
            else:
                # only the client id is logged; the tokens are credentials.
                logger.info("Got a client: {}".format(client_msg['client_id']))
                channel.put({'status': 'ok',
                             'actor_id': actor_id,
                             'tenant': tenant,
                             'client': 'yes',
                             'client_id': client_msg['client_id'],
                             'client_secret': client_msg['client_secret'],
                             'access_token': client_msg['access_token'],
                             'refresh_token': client_msg['refresh_token'],
                             'api_server': client_msg['api_server'],
                             })
                logger.debug("Sent OK message AND client over anonymous worker channel.")
        else:
            logger.info("Not generating clients. Config value was: {}".format(generate_clients))
            channel.put({'status': 'ok',
                         'actor_id': actor_id,
                         'tenant': tenant,
                         'client': 'no'})
            logger.debug("Sent OK message over anonymous worker channel.")
        # @TODO -
        # delete the anonymous channel from this thread but sleep first to avoid the race condition.
        time.sleep(1.5)
        channel.delete()

    # due to the race condition deleting channels (potentially before all workers have received all messages)
    # we put a sleep here.
    time.sleep(1)
    for ch in new_channels:
        try:
            # the new_channels are the spawnerworker channels so they can be deleted.
            ch.delete()
        except Exception as e:
            logger.error("Got exception trying to delete spawnerworker channel: {}".format(e))
    logger.info("Done processing command.")
def client_generation(self, actor_id, worker_id, tenant):
    """Request an OAuth client for a worker, retrying up to 10 times.

    Each attempt opens a fresh ``ClientsChannel``; attempts after the first are preceded by
    a 2 second sleep. An attempt fails when ``request_client`` raises or when the reply has
    status 'error'. A reply whose message contains 'AgaveClientFailedDoNotRetry' ends the
    retries immediately. When the attempts are exhausted the worker is killed with
    ``kill_worker`` and an exception is raised.

    Update - 4/2020: we do NOT set the actor to an error state when client generation fails
    because this is not something the user has control over.

    :param actor_id: id of the actor the worker belongs to.
    :param worker_id: id of the worker the client is generated for.
    :param tenant: tenant forwarded with the request.
    :return: tuple ``(client_id, access_token, refresh_token, api_server, client_secret)``
             taken from the reply message.
    :raises Exception: the error raised by ``request_client`` on the final attempt.
    :raises SpawnerException: when the final (or a do-not-retry) reply has status 'error'.
    """
    max_attempts = 10

    def close_quietly(ch):
        # closing is best effort; a failure here must not mask the real outcome.
        try:
            ch.close()
        except Exception as close_err:
            logger.debug(f"got exception trying to close the client_ch: {close_err}")

    for attempt in range(1, max_attempts + 1):
        # take a break between each subsequent attempt after the first one:
        if attempt > 1:
            time.sleep(2)
        client_ch = ClientsChannel()
        logger.debug(f"trying to generate a client for worker {worker_id}; attempt: {attempt}.")
        try:
            client_msg = client_ch.request_client(tenant=tenant,
                                                  actor_id=actor_id,
                                                  worker_id=worker_id,
                                                  secret=self.secret)
        except Exception as e:
            logger.error("Got a ChannelTimeoutException trying to generate a client for "
                         "actor_id: {}; worker_id: {}; exception: {}".format(actor_id, worker_id, e))
            close_quietly(client_ch)
            if attempt == max_attempts:
                self.kill_worker(actor_id, worker_id)
                logger.critical("Client generation FAILED.")
                # bare raise: re-raises the request error with its traceback and does not
                # depend on a name that a nested `except ... as` could have unbound.
                raise
            # no reply was received, so there is nothing to inspect; go straight to the
            # next attempt instead of reading an unbound (or stale) client_msg.
            continue
        close_quietly(client_ch)

        if client_msg.get('status') != 'error':
            # client was generated successfully. Only the client id is logged: the
            # access and refresh tokens are credentials and must not be written to logs.
            logger.info("Got a client: {}".format(client_msg['client_id']))
            return client_msg['client_id'], \
                client_msg['access_token'], \
                client_msg['refresh_token'], \
                client_msg['api_server'], \
                client_msg['client_secret']

        logger.error("Error generating client; worker_id: {}; message: {}".format(
            worker_id, client_msg.get('message')))
        # check to see if the error was an error that cannot be retried
        # (`or ''` guards against an error reply that carries no message):
        do_not_retry = 'AgaveClientFailedDoNotRetry' in (client_msg.get('message') or '')
        if do_not_retry:
            logger.debug(f"got AgaveClientFailedDoNotRetry in message for worker {worker_id}. "
                         f"Giving up and setting attempts directly to 10.")
        if do_not_retry or attempt == max_attempts:
            self.kill_worker(actor_id, worker_id)
            raise SpawnerException("Error generating client")  # TODO - clean up error message
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for a message to stop processing.

    Dict messages whose 'value' is 'status' are health checks: 'ok' is put on their
    'reply_to' channel, which is then deleted after a short sleep. The messages 'stop' and
    'stop-no-delete' shut the worker down: the client (when one was passed) is deleted via
    the clients channel, the worker record is deleted, the worker channel is deleted (and
    the actor channel too, unless the message was 'stop-no-delete'), the main thread is
    interrupted and the process exits.

    :param tenant: tenant forwarded with the client delete request.
    :param worker_ch: channel read with `get_one()`; deleted on stop.
    :param actor_id: id of the actor this worker belongs to.
    :param worker_id: id of this worker.
    :param actor_ch: actor channel; deleted on 'stop', kept on 'stop-no-delete'.
    :param ag_client: object exposing `api_key`, or a falsy value when no client was passed.
    :return:
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        msg = worker_ch.get_one()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))
        if isinstance(msg, dict):
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting entire anon channel instead (see above)
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'stop' or msg == 'stop-no-delete':
            logger.info("Worker with worker_id: {} (actor_id: {}) received stop message, "
                        "stopping worker...".format(worker_id, actor_id))
            # when an actor's image is updated, old workers are deleted while new workers are
            # created. Deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            # first, delete an associated client
            # its possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                # keep the reply in its own name instead of overwriting the loop's `msg`.
                reply = clients_ch.request_delete_client(tenant=tenant,
                                                         actor_id=actor_id,
                                                         worker_id=worker_id,
                                                         client_id=ag_client.api_key,
                                                         secret=secret)
                if reply['status'] == 'ok':
                    logger.info("Client delete request completed successfully for "
                                "worker_id: {}, client_id: {}.".format(worker_id, ag_client.api_key))
                else:
                    # arguments are in placeholder order: worker id, client id, message.
                    logger.error("Error deleting client for "
                                 "worker_id: {}, client_id: {}. Message: {}".format(worker_id,
                                                                                   ag_client.api_key,
                                                                                   reply['message']))
                clients_ch.close()
            else:
                logger.info("Did not receive client. Not issuing delete. Exiting.")
            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}"
                            "Exception: {}".format(worker_id, e))
            keep_running = False
            # delete associated channels:
            if delete_actor_ch:
                actor_ch.delete()
            worker_ch.delete()
            logger.info("WorkerChannel deleted and ActorMsgChannel closed for actor: {} worker_id: {}".format(actor_id, worker_id))
            logger.info("Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interruptted.")
            # os._exit requires an exit status; calling it with no argument raises
            # TypeError and the process would not exit here.
            os._exit(0)
class ClientGenerator(object):
    """Serves `new` and `delete` client requests arriving on the clients channel."""

    def __init__(self):
        """Read the shared secret and per-tenant service credentials from the environment.

        Credentials come from ``_abaco_<tenant>_username`` / ``_abaco_<tenant>_password``
        for every tenant in ``get_tenants()``, defaulting to the empty string.
        """
        self.secret = os.environ.get('_abaco_secret')
        self.ch = ClientsChannel()
        self.credentials = {}
        for tenant in get_tenants():
            self.credentials[tenant] = {'username': os.environ.get('_abaco_{}_username'.format(tenant), ''),
                                        'password': os.environ.get('_abaco_{}_password'.format(tenant), '')}

    def get_agave(self, tenant, actor_owner):
        """
        Generate an agavepy client representing a specific user owning an actor.
        The `actor_owner` should be the username associated with the owner of the actor.

        :return: tuple ``(api_server, Agave client)``.
        :raises ClientException: when no username/password is configured for the tenant.
        """
        # these are the credentials of the abaco service account. this account should have the abaco and
        # impersonator roles.
        # .get() so that a tenant without a credentials entry is reported as a
        # ClientException (which callers handle) rather than an uncaught KeyError.
        creds = self.credentials.get(tenant.upper(), {})
        username = creds.get('username', '')
        password = creds.get('password', '')
        if username == '' or password == '':
            raise ClientException('Client service credentials not defined for tenant {}'.format(tenant))
        api_server = get_api_server(tenant)
        # generate an Agave client set up for admin_password representing the actor owner:
        return api_server,\
            Agave(api_server=api_server, username=username, password=password, token_username=actor_owner)

    def run(self):
        """
        Listen to the clients channel for new client and deletion requests. Requests use the put_sync method
        to send an anonymous channel together with the actual client request command.
        """
        while True:
            message = self.ch.get()
            print("cleintg processing message: {}".format(message))
            anon_ch = message['reply_to']
            cmd = message['value']
            print("cleintg processing cmd: {}".format(cmd))
            if cmd.get('command') == 'new':
                self.new_client(cmd, anon_ch)
            elif cmd.get('command') == 'delete':
                self.delete_client(cmd, anon_ch)
            else:
                anon_ch.put({'status': 'error',
                             'message': 'Received invalid command: {}'.format(cmd.get('command'))})

    def new_client(self, cmd, anon_ch):
        """Validate a `new` command, create and store the client, and reply on ``anon_ch``."""
        valid, msg, owner = self.check_new_params(cmd)
        if not valid:
            anon_ch.put({'status': 'error',
                         'message': 'Invalid command parameters: {}'.format(msg)})
            return None
        try:
            api_server, key, secret, access_token, refresh_token = self.generate_client(cmd, owner)
        except ClientException as e:
            # reply on the channel object we were handed, like every other branch;
            # it used to be passed as the *name* of a new ClientsChannel.
            anon_ch.put({'status': 'error', 'message': e.msg})
            return None
        cl = Client(**{'tenant': cmd['tenant'],
                       'actor_id': cmd['actor_id'],
                       'worker_id': cmd['worker_id'],
                       'client_key': key,
                       'client_name': cmd['worker_id'],
                       })
        clients_store[cl.id] = cl
        self.send_client(api_server, key, secret, access_token, refresh_token, anon_ch)

    def generate_client(self, cmd, owner):
        """Create a client named after the worker id and return its credentials.

        :return: tuple ``(api_server, api_key, api_secret, access_token, refresh_token)``.
        """
        api_server, ag = self.get_agave(cmd['tenant'], actor_owner=owner)
        ag.clients.create(body={'clientName': cmd['worker_id']})
        # note - the client generates tokens representing the user who registered the actor
        return api_server,\
            ag.api_key, \
            ag.api_secret, \
            ag.token.token_info['access_token'], \
            ag.token.token_info['refresh_token']

    def send_client(self, api_server, client_id, client_secret, access_token, refresh_token, anon_ch):
        """Send client credentials to a worker on an anonymous channel."""
        msg = {'status': 'ok',
               'api_server': api_server,
               'client_id': client_id,
               'client_secret': client_secret,
               'access_token': access_token,
               'refresh_token': refresh_token}
        anon_ch.put(msg)

    def check_common(self, cmd):
        """Checks shared by new and delete requests; returns ``(valid, message)``."""
        # validate the secret
        if not cmd.get('secret') == self.secret:
            return False, 'Invalid secret.'
        # validate tenant
        if not cmd.get('tenant') in get_tenants():
            return False, 'Invalid client passed: {}'.format(cmd.get('tenant'))
        return True, ''

    def check_new_params(self, cmd):
        """Additional checks for new client requests; returns ``(valid, message, owner)``."""
        valid, msg = self.check_common(cmd)
        if not valid:
            # do not touch the stores for a command that already failed the
            # secret/tenant check; report that failure directly.
            return False, msg, None
        # validate the actor_id
        try:
            actor = Actor.from_db(actors_store[cmd.get('actor_id')])
        except KeyError:
            return False, "Unable to look up actor with id: {}".format(cmd.get('actor_id')), None
        # validate the worker id
        try:
            Worker.get_worker(actor_id=cmd.get('actor_id'), ch_name=cmd.get('worker_id'))
        except WorkerException as e:
            return False, "Unable to look up worker: {}".format(e.msg), None
        return valid, msg, actor.owner

    def check_del_params(self, cmd):
        """Additional checks for delete client requests; returns ``(valid, message, owner)``."""
        valid, msg = self.check_common(cmd)
        if not cmd.get('client_id'):
            return False, 'client_id parameter required.', None
        # delete_client() uses the worker id as the client name, so it must be present.
        if not cmd.get('worker_id'):
            return False, 'worker_id parameter required.', None
        # It's possible the actor record has been deleted so we need to remove the client based solely on
        # the information on the command.
        # also, agave owner doesn't matter on delete since we are only using the service account (basic auth).
        return valid, msg, 'abaco_service'

    def delete_client(self, cmd, anon_ch):
        """Delete a client from APIM and from the abaco db, replying on ``anon_ch``."""
        valid, msg, owner = self.check_del_params(cmd)
        if not valid:
            anon_ch.put({'status': 'error',
                         'message': 'Invalid parameters sent: {}'.format(msg)})
            return None
        try:
            _, ag = self.get_agave(cmd['tenant'], owner)
        except ClientException as e:
            anon_ch.put({'status': 'error',
                         'message': 'Could not generate an Agave client: {}'.format(e)})
            return None
        # remove the client from APIM
        try:
            ag.clients.delete(clientName=cmd['worker_id'])
        except Exception as e:
            anon_ch.put({'status': 'error',
                         'message': 'Not able to delete client from APIM. Exception: {}'.format(e)})
            return None
        # remove the client from the abaco db
        try:
            Client.delete_client(tenant=cmd['tenant'], client_key=cmd['client_id'])
        except Exception as e:
            anon_ch.put({'status': 'error',
                         'message': 'Not able to delete client from abaco db. Exception: {}'.format(e)})
            return None
        anon_ch.put({'status': 'ok',
                     'message': 'Client deleted.'})
class ClientGenerator(object):
    """Serves `new` and `delete` client requests arriving on the clients channel."""

    def __init__(self):
        """Read the shared secret and per-tenant service credentials from the environment.

        Credentials come from ``_abaco_<tenant>_username`` / ``_abaco_<tenant>_password``
        for every tenant in ``get_tenants()``, defaulting to the empty string.
        """
        self.secret = os.environ.get('_abaco_secret')
        self.ch = ClientsChannel()
        self.credentials = {}
        for tenant in get_tenants():
            self.credentials[tenant] = {
                'username': os.environ.get('_abaco_{}_username'.format(tenant), ''),
                'password': os.environ.get('_abaco_{}_password'.format(tenant), '')
            }

    def get_agave(self, tenant, actor_owner):
        """
        Generate an agavepy client representing a specific user owning an actor.
        The `actor_owner` should be the username associated with the owner of the actor.

        :return: tuple ``(api_server, Agave client)``.
        :raises ClientException: when no username/password is configured for the tenant.
        """
        # these are the credentials of the abaco service account. this account should have the abaco and
        # impersonator roles.
        # .get() so that a tenant without a credentials entry is reported as a
        # ClientException (which callers handle) rather than an uncaught KeyError.
        creds = self.credentials.get(tenant.upper(), {})
        username = creds.get('username', '')
        password = creds.get('password', '')
        if username == '' or password == '':
            msg = 'Client service credentials not defined for tenant {}'.format(tenant)
            logger.error(msg)
            raise ClientException(msg)
        api_server = get_api_server(tenant)
        # generate an Agave client set up for admin_password representing the actor owner:
        logger.info("Attempting to generate an agave client.")
        return api_server,\
            Agave(api_server=api_server, username=username, password=password, token_username=actor_owner)

    def run(self):
        """
        Listen to the clients channel for new client and deletion requests. Requests use the put_sync method
        to send an anonymous channel together with the actual client request command.
        """
        while True:
            message = self.ch.get()
            logger.info("cleintg processing message: {}".format(message))
            anon_ch = message['reply_to']
            cmd = message['value']
            if cmd.get('command') == 'new':
                logger.debug("calling new_client().")
                self.new_client(cmd, anon_ch)
            elif cmd.get('command') == 'delete':
                logger.debug("calling delete_client().")
                self.delete_client(cmd, anon_ch)
            else:
                msg = 'Received invalid command: {}'.format(cmd.get('command'))
                logger.error(msg)
                anon_ch.put({'status': 'error', 'message': msg})

    def new_client(self, cmd, anon_ch):
        """Validate a `new` command, create and store the client, and reply on ``anon_ch``."""
        valid, msg, owner = self.check_new_params(cmd)
        if not valid:
            m = 'Invalid command parameters: {}'.format(msg)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        try:
            api_server, key, secret, access_token, refresh_token = self.generate_client(cmd, owner)
        except ClientException as e:
            logger.error("Error generating client: {}".format(e))
            # reply on the channel object we were handed, like every other branch;
            # it used to be passed as the *name* of a new ClientsChannel.
            anon_ch.put({'status': 'error', 'message': e.msg})
            return None
        logger.debug("Client generated.")
        cl = Client(**{
            'tenant': cmd['tenant'],
            'actor_id': cmd['actor_id'],
            'worker_id': cmd['worker_id'],
            'client_key': key,
            'client_name': cmd['worker_id'],
        })
        clients_store[cl.id] = cl
        logger.info("client generated and stored. client: {}".format(cl))
        self.send_client(api_server, key, secret, access_token, refresh_token, anon_ch)

    def generate_client(self, cmd, owner):
        """Create a client named after the worker id and return its credentials.

        :return: tuple ``(api_server, api_key, api_secret, access_token, refresh_token)``.
        """
        api_server, ag = self.get_agave(cmd['tenant'], actor_owner=owner)
        ag.clients.create(body={'clientName': cmd['worker_id']})
        # note - the client generates tokens representing the user who registered the actor
        logger.info("ag.clients.create successful.")
        return api_server,\
            ag.api_key, \
            ag.api_secret, \
            ag.token.token_info['access_token'], \
            ag.token.token_info['refresh_token']

    def send_client(self, api_server, client_id, client_secret, access_token, refresh_token, anon_ch):
        """Send client credentials to a worker on an anonymous channel."""
        logger.info("sending client credentials for client: {} to channel: {}".format(client_id, anon_ch))
        msg = {
            'status': 'ok',
            'api_server': api_server,
            'client_id': client_id,
            'client_secret': client_secret,
            'access_token': access_token,
            'refresh_token': refresh_token
        }
        anon_ch.put(msg)

    def check_common(self, cmd):
        """Common check for new and delete client requests."""
        # validate the secret
        if not cmd.get('secret') == self.secret:
            m = 'Invalid secret.'
            logger.error(m)
            return False, m
        # validate tenant
        if not cmd.get('tenant') in get_tenants():
            m = 'Invalid client passed: {}'.format(cmd.get('tenant'))
            logger.error(m)
            return False, m
        logger.debug("common params were valid.")
        return True, ''

    def check_new_params(self, cmd):
        """Additional checks for new client requests."""
        valid, msg = self.check_common(cmd)
        if not valid:
            # do not touch the stores for a command that already failed the
            # secret/tenant check; report that failure directly.
            return False, msg, None
        # validate the actor_id
        try:
            actor = Actor.from_db(actors_store[cmd.get('actor_id')])
        except KeyError:
            m = "Unable to look up actor with id: {}".format(cmd.get('actor_id'))
            logger.error(m)
            return False, m, None
        # validate the worker id
        try:
            Worker.get_worker(actor_id=cmd.get('actor_id'), worker_id=cmd.get('worker_id'))
        except WorkerException as e:
            m = "Unable to look up worker: {}".format(e.msg)
            logger.error(m)
            return False, m, None
        logger.debug("new params were valid.")
        return valid, msg, actor.owner

    def check_del_params(self, cmd):
        """Additional checks for delete client requests."""
        valid, msg = self.check_common(cmd)
        if not cmd.get('client_id'):
            m = 'client_id parameter required.'
            logger.error(m)
            return False, m, None
        # delete_client() uses the worker id as the client name, so it must be present.
        if not cmd.get('worker_id'):
            m = 'worker_id parameter required.'
            logger.error(m)
            return False, m, None
        # It's possible the actor record has been deleted so we need to remove the client based solely on
        # the information on the command.
        # also, agave owner doesn't matter on delete since we are only using the service account (basic auth).
        logger.debug("del params were valid.")
        return valid, msg, 'abaco_service'

    def delete_client(self, cmd, anon_ch):
        """Delete a client from APIM and from the abaco db, replying on ``anon_ch``."""
        valid, msg, owner = self.check_del_params(cmd)
        if not valid:
            anon_ch.put({
                'status': 'error',
                'message': 'Invalid parameters sent: {}'.format(msg)
            })
            return None
        try:
            _, ag = self.get_agave(cmd['tenant'], owner)
        except ClientException as e:
            m = 'Could not generate an Agave client: {}'.format(e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        # remove the client from APIM
        try:
            ag.clients.delete(clientName=cmd['worker_id'])
        except Exception as e:
            m = 'Not able to delete client from APIM. Exception: {}'.format(e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        # remove the client from the abaco db
        try:
            Client.delete_client(tenant=cmd['tenant'], client_key=cmd['client_id'])
        except Exception as e:
            m = 'Not able to delete client from abaco db. Exception: {}'.format(e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        logger.info("client deleted successfully.")
        anon_ch.put({'status': 'ok', 'message': 'Client deleted.'})
def process(self, cmd):
    """
    Process one spawner command: start new workers for an actor and hand each
    of them its start-up message.

    `cmd` must contain 'actor_id', 'worker_ids', 'image' and 'tenant'. It may
    contain 'stop_existing' (defaults to True) and 'num' (defaults to
    self.num_workers).

    Returns None. If start_workers() raises SpawnerException the command is
    abandoned without any further action.
    """
    print("Processing cmd:{}".format(str(cmd)))
    actor_id = cmd['actor_id']
    worker_ids = cmd['worker_ids']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = cmd.get('num', self.num_workers)
    print("Actor id:{}".format(actor_id))

    def start_message(client_flag, **extra):
        # Build a fresh payload per worker; every start-up message shares these four keys.
        payload = {
            'status': 'ok',
            'actor_id': actor_id,
            'tenant': tenant,
            'client': client_flag,
        }
        payload.update(extra)
        return payload

    try:
        _, anon_channels, new_workers = self.start_workers(
            actor_id, worker_ids, image, tenant, num_workers)
    except SpawnerException:
        # for now, start_workers will do clean up for a SpawnerException, so we just need
        # to return back to the run loop.
        return
    print("Created new workers: {}".format(str(new_workers)))

    # the worker records are written before the workers are messaged so that the
    # records will be there when the workers go to update their status.
    if stop_existing:
        # existing workers are stopped and the store entry is replaced wholesale.
        self.stop_workers(actor_id, worker_ids)
        workers_store[actor_id] = new_workers
    else:
        # existing workers are kept, so the new ones are added alongside them.
        for worker in new_workers.values():
            Worker.add_worker(actor_id, worker)

    # Tell new workers to subscribe to the actor channel.
    # If abaco is configured to generate clients for the workers, generate them now
    # and send new workers their clients.
    generate_clients = Config.get('workers', 'generate_clients').lower()
    # new_workers is a dictionary of dictionaries; list(d) is the list of its keys,
    # so the idx^th key is taken to belong to the idx^th anonymous channel.
    worker_keys = list(new_workers)
    for idx, channel in enumerate(anon_channels):
        if generate_clients != 'true':
            print("Not generating clients. Config value was: {}".format(
                generate_clients))
            channel.put(start_message('no'))
            continue

        print("Getting client for worker {}".format(idx))
        client_ch = ClientsChannel()
        client_msg = client_ch.request_client(
            tenant=tenant,
            actor_id=actor_id,
            worker_id=new_workers[worker_keys[idx]]['id'],
            secret=self.secret)

        if client_msg.get('status') == 'error':
            # we need to ignore errors when generating clients because it's possible it is
            # not set up for a specific tenant. we log it instead.
            print("Error generating client: {}".format(
                client_msg.get('message')))
            channel.put(start_message('no'))
            continue

        # client was generated successfully:
        print("Got a client: {}, {}, {}".format(
            client_msg['client_id'],
            client_msg['access_token'],
            client_msg['refresh_token']))
        channel.put(start_message(
            'yes',
            client_id=client_msg['client_id'],
            client_secret=client_msg['client_secret'],
            access_token=client_msg['access_token'],
            refresh_token=client_msg['refresh_token'],
            api_server=client_msg['api_server'],
        ))
    print("Done processing command.")
def process(self, cmd):
    """
    Process one spawner command: start new workers for an actor, request a
    client for each of them and send every worker its start-up message.

    `cmd` must contain 'actor_id', 'image' and 'tenant'. It may contain
    'stop_existing' (defaults to True) and 'num' (defaults to
    self.num_workers).

    Returns None. If start_workers() raises SpawnerException the command is
    abandoned without any further action.
    """
    print("Processing cmd:{}".format(str(cmd)))
    actor_id = cmd['actor_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = cmd.get('num', self.num_workers)
    print("Actor id:{}".format(actor_id))

    def start_message(client_flag, **extra):
        # Build a fresh payload per worker; every start-up message shares these four keys.
        payload = {
            'status': 'ok',
            'actor_id': actor_id,
            'tenant': tenant,
            'client': client_flag,
        }
        payload.update(extra)
        return payload

    try:
        _, anon_channels, new_workers = self.start_workers(
            actor_id, image, tenant, num_workers)
    except SpawnerException:
        # for now, start_workers will do clean up for a SpawnerException, so we just need
        # to return back to the run loop.
        return
    print("Created new workers: {}".format(str(new_workers)))

    # the worker records are written before the workers are messaged so that the
    # records will be there when the workers go to update their status.
    if stop_existing:
        # existing workers are stopped and the store entry is replaced wholesale.
        self.stop_workers(actor_id)
        workers_store[actor_id] = new_workers
    else:
        # existing workers are kept, so the new ones are added alongside them.
        for worker in new_workers.values():
            Worker.add_worker(actor_id, worker)

    # send new workers their clients and tell them to subscribe to the actor channel.
    # new_workers is a dictionary of dictionaries; list(d) is the list of its keys,
    # so the idx^th key is taken to belong to the idx^th anonymous channel.
    worker_keys = list(new_workers)
    for idx, channel in enumerate(anon_channels):
        print("Getting client for worker {}".format(idx))
        client_ch = ClientsChannel()
        client_msg = client_ch.request_client(
            tenant=tenant,
            actor_id=actor_id,
            worker_id=new_workers[worker_keys[idx]]['ch_name'],
            secret=self.secret)

        if client_msg.get('status') == 'error':
            # we need to ignore errors when generating clients because it's possible it is
            # not set up for a specific tenant. we log it instead.
            print("Error generating client: {}".format(
                client_msg.get('message')))
            channel.put(start_message('no'))
            continue

        # client was generated successfully:
        print("Got a client: {}, {}, {}".format(
            client_msg['client_id'],
            client_msg['access_token'],
            client_msg['refresh_token']))
        channel.put(start_message(
            'yes',
            client_id=client_msg['client_id'],
            client_secret=client_msg['client_secret'],
            access_token=client_msg['access_token'],
            refresh_token=client_msg['refresh_token'],
            api_server=client_msg['api_server'],
        ))
    print("Done processing command.")
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag_client):
    """
    Target for a thread to listen on the worker channel for control messages.

    Messages handled:
      * a dict whose 'value' is 'status': health check. 'ok' is put on the
        message's 'reply_to' channel, which is deleted after a 1.5s sleep.
      * 'force_quit': sets globals.force_quit to True.
      * 'stop' / 'stop-no-delete': sets the worker status to SHUTTING_DOWN,
        requests deletion of the worker's client (when `ag_client` is truthy),
        deletes the worker record, deletes the actor channel (only for 'stop')
        and the worker channel, then interrupts the main thread and ends the
        process with os._exit(0).
    Any other message is logged and ignored.

    :param tenant: tenant sent with the client delete request.
    :param worker_ch: channel the control messages are read from; deleted on stop.
    :param actor_id: id of the actor this worker serves.
    :param worker_id: id of this worker.
    :param actor_ch: the actor's message channel; deleted on 'stop' only.
    :param ag_client: client object (its `api_key` is used as the client id) or a
        falsy value when the worker was not passed a client.
    :return: None; after a stop message the process exits instead of returning.
    """
    # NOTE(review): the loop condition reads the module-level `keep_running`, while the stop
    # branch below sets `globals.keep_running`; confirm these are meant to be distinct flags.
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while keep_running:
        msg, msg_obj = worker_ch.get_one()
        # receiving the message is enough to ack it - resiliency is currently handled in the calling code.
        msg_obj.ack()
        logger.debug("Received message in worker channel: {}".format(msg))
        logger.debug("Type(msg)={}".format(type(msg)))

        if isinstance(msg, dict):
            value = msg.get('value', '')
            if value == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                ch = msg['reply_to']
                ch.put('ok')
                # @TODO -
                # delete the anonymous channel from this thread but sleep first to avoid the race condition.
                time.sleep(1.5)
                ch.delete()
                # NOT doing this for now -- deleting entire anon channel instead (see above)
                # clean up the event queue on this anonymous channel. this should be fixed in channelpy.
                # ch._queue._event_queue
        elif msg == 'force_quit':
            logger.info(
                "Worker with worker_id: {} (actor_id: {}) received a force_quit message, "
                "forcing the execution to halt...".format(worker_id, actor_id))
            globals.force_quit = True
        elif msg in ('stop', 'stop-no-delete'):
            logger.info(
                "Worker with worker_id: {} (actor_id: {}) received stop message, "
                "stopping worker...".format(worker_id, actor_id))
            # set the worker status to SHUTTING_DOWN; a failure here is logged but must not
            # prevent the rest of the shutdown from running.
            try:
                Worker.update_worker_status(actor_id, worker_id, SHUTTING_DOWN)
            except Exception as e:
                logger.error(
                    f"worker got exception trying to update status to SHUTTING_DOWN. actor_id: {actor_id};"
                    f"worker_id: {worker_id}; exception: {e}")
            globals.keep_running = False

            # when an actor's image is updated, old workers are deleted while new workers are
            # created. Deleting the actor msg channel in this case leads to race conditions
            delete_actor_ch = True
            if msg == 'stop-no-delete':
                logger.info("Got stop-no-delete; will not delete actor_ch.")
                delete_actor_ch = False
            else:
                # if a `stop` was sent, the actor is being deleted, and so we want to
                # immediately shutdown processing.
                globals.force_quit = True

            # first, delete an associated client
            # its possible this worker was not passed a client,
            # but if so, we need to delete it before shutting down.
            if ag_client:
                logger.info("Requesting client {} be deleted.".format(
                    ag_client.api_key))
                secret = os.environ.get('_abaco_secret')
                clients_ch = ClientsChannel()
                # kept in its own name so the control message in `msg` is not overwritten.
                delete_resp = clients_ch.request_delete_client(
                    tenant=tenant,
                    actor_id=actor_id,
                    worker_id=worker_id,
                    client_id=ag_client.api_key,
                    secret=secret)
                if delete_resp['status'] == 'ok':
                    logger.info(
                        "Client delete request completed successfully for "
                        "worker_id: {}, client_id: {}.".format(
                            worker_id, ag_client.api_key))
                else:
                    # arguments are passed in placeholder order: worker id, client id, message.
                    logger.error(
                        "Error deleting client for "
                        "worker_id: {}, client_id: {}. Message: {}".format(
                            worker_id, ag_client.api_key, delete_resp['message']))
                clients_ch.close()
            else:
                logger.info(
                    "Did not receive client. Not issuing delete. Exiting.")

            try:
                Worker.delete_worker(actor_id, worker_id)
            except WorkerException as e:
                logger.info("Got WorkerException from delete_worker(). "
                            "worker_id: {}; "
                            "Exception: {}".format(worker_id, e))

            # delete associated channels:
            # it is possible the actor channel was already deleted, in which case we just keep processing
            if delete_actor_ch:
                try:
                    actor_ch.delete()
                    logger.info(
                        "ActorChannel deleted for actor: {} worker_id: {}".
                        format(actor_id, worker_id))
                except Exception as e:
                    logger.info(
                        "Got exception deleting ActorChannel for actor: {} "
                        "worker_id: {}; exception: {}".format(
                            actor_id, worker_id, e))
            # the worker channel is deleted for both 'stop' and 'stop-no-delete'.
            try:
                worker_ch.delete()
                logger.info(
                    "WorkerChannel deleted for actor: {} worker_id: {}".format(
                        actor_id, worker_id))
            except Exception as e:
                logger.info(
                    "Got exception deleting WorkerChannel for actor: {} "
                    "worker_id: {}; exception: {}".format(
                        actor_id, worker_id, e))

            logger.info(
                "Worker with worker_id: {} is now exiting.".format(worker_id))
            _thread.interrupt_main()
            logger.info("main thread interrupted, issuing os._exit()...")
            os._exit(0)
class ClientGenerator(object):
    """
    Listens on the clients channel and creates or deletes Agave OAuth clients
    for actor workers, replying to each request on the anonymous channel that
    accompanied it.
    """

    def __init__(self):
        """
        Read the shared secret from the environment, connect to the clients
        channel and load the per-tenant service account credentials.

        Raises RuntimeError if the clients channel still cannot be
        instantiated after more than 10 failed attempts.
        """
        self.secret = os.environ.get('_abaco_secret')
        # ClientsChannel() may raise RuntimeError; keep retrying and re-raise the
        # last error once more than 10 attempts have failed.
        failures = 0
        while True:
            try:
                self.ch = ClientsChannel()
                break
            except RuntimeError:
                failures += 1
                if failures > 10:
                    raise
        # credentials are keyed by the tenant names returned by get_tenants(); a
        # missing environment variable is stored as the empty string.
        self.credentials = {
            tenant: {
                'username': os.environ.get('_abaco_{}_username'.format(tenant), ''),
                'password': os.environ.get('_abaco_{}_password'.format(tenant), ''),
            }
            for tenant in get_tenants()
        }

    def get_agave(self, tenant, actor_owner):
        """
        Generate an agavepy client representing a specific user owning an actor.
        The `actor_owner` should be the username associated with the owner of the actor.

        Returns a tuple (api_server, Agave instance).
        Raises ClientException when no service credentials are defined for the
        tenant or when the Agave object cannot be instantiated.
        """
        # these are the credentials of the abaco service account. this account should have the abaco and
        # impersonator roles.
        # a tenant with no credentials entry is reported the same way as one with empty
        # credentials, so callers only ever need to handle ClientException.
        tenant_creds = self.credentials.get(tenant.upper(), {})
        username = tenant_creds.get('username', '')
        password = tenant_creds.get('password', '')
        if username == '' or password == '':
            msg = 'Client service credentials not defined for tenant {}'.format(
                tenant)
            logger.error(msg)
            raise ClientException(msg)
        api_server = get_api_server(tenant)
        verify = get_tenant_verify(tenant)
        # generate an Agave client set up for admin_password representing the actor owner:
        logger.info("Attempting to generate an agave client.")
        try:
            return api_server, Agave(api_server=api_server,
                                     username=username,
                                     password=password,
                                     token_username=actor_owner,
                                     verify=verify)
        except Exception as e:
            msg = "Got exception trying to instantiate Agave object; exception: {}".format(
                e)
            logger.error(msg)
            raise ClientException(msg) from e

    def run(self):
        """
        Listen to the clients channel for new client and deletion requests. Requests use the put_sync method
        to send an anonymous channel together with the actual client request command.

        Loops forever. Each message is acked immediately, dispatched on its
        'command' value ('new' or 'delete'), and the reply channel is deleted
        after a 1.5s sleep.
        """
        while True:
            message, msg_obj = self.ch.get_one()
            # we directly ack messages from the clients channel because caller expects direct reply_to
            msg_obj.ack()
            logger.info("clientg processing message: {}".format(message))
            anon_ch = message['reply_to']
            cmd = message['value']
            command = cmd.get('command')
            if command == 'new':
                logger.debug("calling new_client().")
                self.new_client(cmd, anon_ch)
            elif command == 'delete':
                logger.debug("calling delete_client().")
                self.delete_client(cmd, anon_ch)
            else:
                msg = 'Received invalid command: {}'.format(command)
                logger.error(msg)
                anon_ch.put({'status': 'error', 'message': msg})
            # @TODO -
            # delete the anonymous channel from this thread but sleep first to avoid the race condition.
            time.sleep(1.5)
            logger.info(
                "deleting the anon_ch associated with the clientg message. anon_ch.name: {}"
                .format(anon_ch.name))
            anon_ch.delete()
            logger.debug("anon_ch deleted.")
            # NOT doing this for now -- deleting entire anon channel instead (see above)
            # clean up the anon channel event queue. this is an issue with the
            # channelpy library
            # anon_ch._queue._event_queue.delete()

    def new_client(self, cmd, anon_ch):
        """
        Main function to process a `new` command message.

        Validates `cmd`, generates an OAuth client, stores a Client record in
        clients_store and sends the credentials on `anon_ch`. On any failure an
        error message is put on `anon_ch` instead. Returns None.
        """
        valid, msg, owner = self.check_new_params(cmd)
        if not valid:
            m = 'Invalid command parameters: {}'.format(msg)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        try:
            api_server, key, secret, access_token, refresh_token = self.generate_client(
                cmd, owner)
        except ClientException as e:
            logger.error("Error generating client: {}".format(e))
            anon_ch.put({'status': 'error', 'message': str(e.msg)})
            return None
        logger.debug("Client generated.")
        cl = Client(
            **{
                'tenant': cmd['tenant'],
                'actor_id': cmd['actor_id'],
                'worker_id': cmd['worker_id'],
                'client_key': key,
                'client_name': cmd['worker_id'],
            })
        clients_store[cl.id] = cl
        logger.info("client generated and stored. client: {}".format(cl))
        self.send_client(api_server, key, secret, access_token, refresh_token,
                         anon_ch)

    def generate_client(self, cmd, owner):
        """
        Generate an Agave OAuth client whose name is equal to the worker_id that will be using said client.

        Returns a tuple (api_server, api_key, api_secret, access_token, refresh_token).
        Raises ClientException if the Agave object or the OAuth client cannot be
        created; the message states whether a retry is possible.
        """
        logger.debug("top of generate_client(); cmd: {}; owner: {}".format(
            cmd, owner))
        api_server, ag = self.get_agave(cmd['tenant'], actor_owner=owner)
        worker_id = cmd['worker_id']
        logger.debug("Got agave object; now generating OAuth client.")
        try:
            ag.clients.create(body={'clientName': worker_id})
        except Exception as e:
            msg = "clientg got exception trying to create OAuth client for worker {}; " \
                  "exception: {}; type(e): {}".format(worker_id, e, type(e))
            logger.error(msg)
            # set the exception message depending on whether retry is possible:
            if isinstance(e, AgaveClientFailedDoNotRetry):
                exception_msg = f"AgaveClientFailedDoNotRetry error for worker {worker_id}"
            else:
                exception_msg = f"AgaveClientFailedCanRetry error for worker {worker_id}"
            logger.info(exception_msg)
            raise ClientException(exception_msg) from e
        # note - the client generates tokens representing the user who registered the actor
        logger.info("ag.clients.create successful.")
        token_info = ag.token.token_info
        return (api_server,
                ag.api_key,
                ag.api_secret,
                token_info['access_token'],
                token_info['refresh_token'])

    def send_client(self, api_server, client_id, client_secret, access_token,
                    refresh_token, anon_ch):
        """Send client credentials to a worker on an anonymous channel."""
        logger.info(
            "sending client credentials for client: {} to channel: {}".format(
                client_id, anon_ch))
        msg = {
            'status': 'ok',
            'api_server': api_server,
            'client_id': client_id,
            'client_secret': client_secret,
            'access_token': access_token,
            'refresh_token': refresh_token
        }
        anon_ch.put(msg)

    def check_common(self, cmd):
        """
        Common check for new and delete client requests.

        Returns (True, '') when the secret matches and the tenant is one of
        get_tenants(); otherwise (False, <reason>).
        """
        # validate the secret
        if cmd.get('secret') != self.secret:
            m = 'Invalid secret.'
            logger.error(m)
            return False, m
        # validate tenant
        if cmd.get('tenant') not in get_tenants():
            m = 'Invalid tenant passed: {}'.format(cmd.get('tenant'))
            logger.error(m)
            return False, m
        logger.debug("common params were valid.")
        return True, ''

    def check_new_params(self, cmd):
        """
        Additional checks for new client requests.

        Returns (valid, message, owner). `owner` is the actor owner, prefixed
        with the tenant's userstore prefix when one is configured, and None
        whenever validation fails.
        """
        valid, msg = self.check_common(cmd)
        if not valid:
            # stop here so that a request with a bad secret or tenant never triggers
            # datastore lookups and always reports the original reason.
            return False, msg, None
        # validate the actor_id
        try:
            actor = Actor.from_db(actors_store[cmd.get('actor_id')])
        except KeyError:
            m = "Unable to look up actor with id: {}".format(
                cmd.get('actor_id'))
            logger.error(m)
            return False, m, None
        # validate the worker id
        try:
            Worker.get_worker(actor_id=cmd.get('actor_id'),
                              worker_id=cmd.get('worker_id'))
        except WorkerException as e:
            m = "Unable to look up worker: {}".format(e.msg)
            logger.error(m)
            return False, m, None
        logger.debug("new params were valid.")
        owner_prefix = get_tenant_userstore_prefix(actor.tenant)
        logger.debug(
            f"using owner prefix: {owner_prefix} for tenant: {actor.tenant}")
        if owner_prefix:
            owner = f"{owner_prefix}/{actor.owner}"
        else:
            owner = actor.owner
        logger.debug(f"using owner: {owner}")
        return valid, msg, owner

    def check_del_params(self, cmd):
        """
        Additional checks for delete client requests.

        Returns (valid, message, owner); `owner` is 'abaco_service' on success
        and None when validation fails.
        """
        valid, msg = self.check_common(cmd)
        if not valid:
            # report the secret/tenant failure itself rather than a later check's message.
            return False, msg, None
        if not cmd.get('client_id'):
            m = 'client_id parameter required.'
            logger.error(m)
            return False, m, None
        # It's possible the actor record has been deleted so we need to remove the client based solely on
        # the information on the command.
        # also, agave owner doesn't matter on delete since we are only using the service account (basic auth).
        logger.debug("del params were valid.")
        return valid, msg, 'abaco_service'

    def delete_client(self, cmd, anon_ch):
        """
        Main function to process a `delete` command message.

        Removes the client named after cmd['worker_id'] from APIM and then the
        matching record from the abaco db, reporting the outcome on `anon_ch`.
        Processing stops at the first failing step. Returns None.
        """
        valid, msg, owner = self.check_del_params(cmd)
        if not valid:
            anon_ch.put({
                'status': 'error',
                'message': 'Invalid parameters sent: {}'.format(msg)
            })
            return None
        try:
            _, ag = self.get_agave(cmd['tenant'], owner)
        except ClientException as e:
            m = 'Could not generate an Agave client: {}'.format(e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        # remove the client from APIM
        try:
            ag.clients.delete(clientName=cmd['worker_id'])
        except Exception as e:
            m = 'Not able to delete client from APIM. Exception: {}'.format(e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        # remove the client from the abaco db
        try:
            Client.delete_client(tenant=cmd['tenant'],
                                 client_key=cmd['client_id'])
        except Exception as e:
            m = 'Not able to delete client from abaco db. Exception: {}'.format(
                e)
            logger.error(m)
            anon_ch.put({'status': 'error', 'message': m})
            return None
        logger.info("client deleted successfully.")
        anon_ch.put({'status': 'ok', 'message': 'Client deleted.'})