class Spawner(object):
    """Consume commands from the command channel and start/stop actor workers.

    Each command is a dict with at least ``actor_id`` and ``image``; see
    ``process`` for the optional keys that are read.
    """

    def __init__(self):
        # Default number of workers to start when a command does not say.
        self.num_workers = int(Config.get('workers', 'init_count'))
        self.cmd_ch = CommandChannel()

    def run(self):
        """Loop forever, handing every command read from the channel to ``process``."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        Looks up the workers recorded for ``actor_id`` in ``workers_store``.
        When there are any, the actor message channel is closed and a
        ``'stop'`` message is put on every worker's channel.
        """
        try:
            workers = json.loads(workers_store[actor_id])
            print("Found existing workers: {}".format(str(workers)))
        except KeyError:
            print("No existing workers.")
            workers = []

        if not workers:
            return

        # first, close the actor msg channel to prevent any new messages from
        # being pulled by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()

        # now, send messages to workers for a graceful shutdown:
        for worker in workers:
            ch = WorkerChannel(name=worker['ch_name'])
            ch.put('stop')

    def process(self, cmd):
        """Handle one command: start new workers and record them in the store.

        Reads ``actor_id`` and ``image`` (required) plus ``stop_existing``
        (default ``True``) and ``num`` (default ``self.num_workers``) from
        ``cmd``. Returns ``None`` in every case; when starting the workers
        fails the method simply returns without touching the store.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))

        try:
            _, anon_channels, new_workers = self.start_workers(
                actor_id, image, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException,
            # so we just need to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id)

        # tell new workers to subscribe to the actor channel.
        for channel in anon_channels:
            channel.put({'status': 'ok', 'actor_id': actor_id})

        if stop_existing:
            workers = new_workers
        else:
            # An actor without a stored record has no workers yet; treat that
            # the same way stop_workers does instead of failing on KeyError.
            try:
                workers = json.loads(workers_store[actor_id])
            except KeyError:
                workers = []
            workers.extend(new_workers)
        workers_store[actor_id] = json.dumps(workers)

    def start_workers(self, actor_id, image, num_workers):
        """Start ``num_workers`` workers for ``image``.

        Returns a ``(channels, anon_channels, workers)`` tuple of three lists
        in matching order. If any worker fails to start, the actor is put in
        the ERROR state, ``kill_worker`` is called for the workers started so
        far, and a fresh ``SpawnerException`` is raised.
        """
        # Report the number actually requested, not the configured default.
        print("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = []
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image)
                print("channel for worker {} is: {}".format(str(i), ch._name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers.append(worker)
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all
            # workers
            Actor.set_status(actor_id, ERROR)
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".format(
                        str(docker_err)))
            raise SpawnerException()
        return channels, anon_channels, workers

    def start_worker(self, image):
        """Start one worker and wait for its first message.

        Returns ``(ch, reply_to, worker)`` when the worker reports status
        ``'ok'``; raises ``SpawnerException`` for any other status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that
        # image was pulled.
        worker = run_worker(image, ch._name)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] != 'ok':
            print("Got an error status from worker: {}. Raising an exception.".format(
                str(result)))
            raise SpawnerException()
        print("received ack from worker.")
        return ch, result['reply_to'], worker

    def kill_worker(self, worker):
        """Placeholder: currently does nothing."""
        pass
class Spawner(object):
    """Consume commands from the command channel and start/stop actor workers.

    Each command is a dict with at least ``actor_id``, ``image`` and
    ``tenant``; see ``process`` for the optional keys that are read.
    """

    def __init__(self):
        # Default number of workers to start when a command does not say.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # Passed along with every client request made in process().
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Loop forever, handing every command read from the channel to ``process``."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id):
        """Stop existing workers; used when updating an actor's image.

        Looks up the workers recorded for ``actor_id`` in ``workers_store``.
        When there are any, the actor message channel is closed and a
        ``'stop'`` message is put on every worker's channel.
        """
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        existing = list(workers_dict.items())
        if not existing:
            return

        # first, close the actor msg channel to prevent any new messages from
        # being pulled by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()

        # now, send messages to workers for a graceful shutdown:
        for _, worker in existing:
            ch = WorkerChannel(name=worker['ch_name'])
            ch.put('stop')

    @staticmethod
    def _subscribe_message(actor_id, tenant, client_msg=None):
        """Build the message that tells a new worker to subscribe to the actor channel.

        Without ``client_msg`` the message carries ``'client': 'no'``; with it,
        ``'client': 'yes'`` plus the client credentials copied from
        ``client_msg``.
        """
        message = {
            'status': 'ok',
            'actor_id': actor_id,
            'tenant': tenant,
            'client': 'no',
        }
        if client_msg is not None:
            message['client'] = 'yes'
            for key in ('client_id', 'client_secret', 'access_token',
                        'refresh_token', 'api_server'):
                message[key] = client_msg[key]
        return message

    def process(self, cmd):
        """Handle one command: start new workers, record them, send them clients.

        Reads ``actor_id``, ``image`` and ``tenant`` (required) plus
        ``stop_existing`` (default ``True``) and ``num`` (default
        ``self.num_workers``) from ``cmd``. Returns ``None`` in every case;
        when starting the workers fails the method returns early.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))

        try:
            _, anon_channels, new_workers = self.start_workers(
                actor_id, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException,
            # so we just need to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id)

        # add workers to store first so that the records will be there when
        # the workers go to update their status
        if stop_existing:
            workers_store[actor_id] = new_workers
        else:
            for worker in new_workers.values():
                Worker.add_worker(actor_id, worker)

        # send new workers their clients and tell them to subscribe to the
        # actor channel. new_workers is a dictionary of dictionaries; its keys
        # are listed once here, and the idx-th key is taken to belong to the
        # idx-th anonymous channel (both were filled in the same order by
        # start_workers).
        worker_keys = list(new_workers)
        for idx, channel in enumerate(anon_channels):
            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=new_workers[worker_keys[idx]]['ch_name'],
                secret=self.secret)
            # we need to ignore errors when generating clients because it's
            # possible it is not set up for a specific tenant. we log it
            # instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(
                    client_msg.get('message')))
                channel.put(self._subscribe_message(actor_id, tenant))
            else:
                # Only the client id is printed; the access and refresh
                # tokens are credentials and must not end up in the output.
                print("Got a client: {}".format(client_msg['client_id']))
                channel.put(
                    self._subscribe_message(actor_id, tenant, client_msg))
        print("Done processing command.")

    def start_workers(self, actor_id, image, tenant, num_workers):
        """Start ``num_workers`` workers for ``image``.

        Returns ``(channels, anon_channels, workers)``: two lists in start
        order and a dict of workers keyed by each worker's ``ch_name``. If any
        worker fails to start, the actor is put in the ERROR state,
        ``kill_worker`` is called for what was started so far, and a fresh
        ``SpawnerException`` is raised.
        """
        # Report the number actually requested, not the configured default.
        print("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                print("starting worker {}".format(str(i)))
                ch, anon_ch, worker = self.start_worker(image, tenant)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker['ch_name']] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all
            # workers
            Actor.set_status(actor_id, ERROR)
            # NOTE(review): iterating the dict hands kill_worker the keys
            # (ch_name values), not the worker records -- confirm which one
            # kill_worker should receive once it is implemented.
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".format(
                        str(docker_err)))
            raise SpawnerException()
        return channels, anon_channels, workers

    def start_worker(self, image, tenant):
        """Start one worker and wait for its first message.

        Returns ``(ch, reply_to, worker)`` when the worker reports status
        ``'ok'``; raises ``SpawnerException`` for any other status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that
        # image was pulled.
        worker_dict = run_worker(image, ch.name)
        worker = Worker(tenant=tenant, **worker_dict)
        print("worker started successfully, waiting on ack that image was pulled...")
        result = ch.get()
        if result['value']['status'] != 'ok':
            print("Got an error status from worker: {}. Raising an exception.".format(
                str(result)))
            raise SpawnerException()
        print("received ack from worker.")
        return ch, result['reply_to'], worker

    def kill_worker(self, worker):
        """Placeholder: currently does nothing."""
        pass
class Spawner(object):
    """Consume commands from the command channel and start/stop actor workers.

    Each command is a dict with at least ``actor_id``, ``worker_ids``,
    ``image`` and ``tenant``; see ``process`` for the optional keys.
    """

    def __init__(self):
        # Default number of workers to start when a command does not say.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # Passed along with every client request made in process().
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Loop forever, handing every command read from the channel to ``process``."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Looks up the workers recorded for ``actor_id`` in ``workers_store``.
        When there are any, the actor message channel is closed and a
        ``'stop'`` message is put on the channel of every worker whose ``id``
        is not in ``worker_ids``.
        """
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            workers_dict = {}

        existing = list(workers_dict.items())
        if not existing:
            return

        # first, close the actor msg channel to prevent any new messages from
        # being pulled by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()

        # now, send messages to workers for a graceful shutdown:
        for _, worker in existing:
            # don't stop the new workers:
            if worker['id'] in worker_ids:
                continue
            ch = WorkerChannel(name=worker['ch_name'])
            ch.put('stop')

    @staticmethod
    def _subscribe_message(actor_id, tenant, client_msg=None):
        """Build the message that tells a new worker to subscribe to the actor channel.

        Without ``client_msg`` the message carries ``'client': 'no'``; with it,
        ``'client': 'yes'`` plus the client credentials copied from
        ``client_msg``.
        """
        message = {
            'status': 'ok',
            'actor_id': actor_id,
            'tenant': tenant,
            'client': 'no',
        }
        if client_msg is not None:
            message['client'] = 'yes'
            for key in ('client_id', 'client_secret', 'access_token',
                        'refresh_token', 'api_server'):
                message[key] = client_msg[key]
        return message

    def process(self, cmd):
        """Handle one command: start new workers, record them, notify them.

        Reads ``actor_id``, ``worker_ids``, ``image`` and ``tenant``
        (required) plus ``stop_existing`` (default ``True``) and ``num``
        (default ``self.num_workers``) from ``cmd``. Returns ``None`` in every
        case; when starting the workers fails the method returns early.
        """
        print("Processing cmd:{}".format(str(cmd)))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        print("Actor id:{}".format(actor_id))

        try:
            _, anon_channels, new_workers = self.start_workers(
                actor_id, worker_ids, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException,
            # so we just need to return back to the run loop.
            return
        print("Created new workers: {}".format(str(new_workers)))

        # stop any existing workers:
        if stop_existing:
            self.stop_workers(actor_id, worker_ids)

        # add workers to store first so that the records will be there when
        # the workers go to update their status
        if stop_existing:
            workers_store[actor_id] = new_workers
        else:
            for worker in new_workers.values():
                Worker.add_worker(actor_id, worker)

        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate
        # them now and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        for idx, channel in enumerate(anon_channels):
            if generate_clients != 'true':
                print("Not generating clients. Config value was: {}".format(
                    generate_clients))
                channel.put(self._subscribe_message(actor_id, tenant))
                continue

            print("Getting client for worker {}".format(idx))
            client_ch = ClientsChannel()
            # start_workers stores the idx-th worker under worker_ids[idx] and
            # appends its anonymous channel at position idx, so this lookup
            # pairs channel and worker without relying on dict ordering.
            client_msg = client_ch.request_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=new_workers[worker_ids[idx]]['id'],
                secret=self.secret)
            # we need to ignore errors when generating clients because it's
            # possible it is not set up for a specific tenant. we log it
            # instead.
            if client_msg.get('status') == 'error':
                print("Error generating client: {}".format(
                    client_msg.get('message')))
                channel.put(self._subscribe_message(actor_id, tenant))
            else:
                # Only the client id is printed; the access and refresh
                # tokens are credentials and must not end up in the output.
                print("Got a client: {}".format(client_msg['client_id']))
                channel.put(
                    self._subscribe_message(actor_id, tenant, client_msg))
        print("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start ``num_workers`` workers for ``image``, using ``worker_ids`` in order.

        Returns ``(channels, anon_channels, workers)``: two lists in start
        order and a dict of workers keyed by worker id. If any worker fails to
        start, the actor is put in the ERROR state with the failure message,
        ``kill_worker`` is called for what was started so far, and a new
        ``SpawnerException`` carrying the same message is raised.
        """
        # Report the number actually requested, not the configured default.
        print("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                print("starting worker {} with id: {}".format(i, worker_id))
                ch, anon_ch, worker = self.start_worker(
                    image, tenant, worker_id)
                print("channel for worker {} is: {}".format(str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            print("Caught SpawnerException:{}".format(str(e)))
            # Keep the message in its own variable: the nested handler below
            # must not rebind ``e``, because Python 3 unbinds an exception
            # name when its ``except`` clause ends.
            message = e.message
            # in case of an error, put the actor in error state and kill all
            # workers
            Actor.set_status(actor_id, ERROR, status_message=message)
            # NOTE(review): iterating the dict hands kill_worker the worker
            # ids, not the worker records -- confirm which one kill_worker
            # should receive once it is implemented.
            for worker in workers:
                try:
                    self.kill_worker(worker)
                except DockerError as docker_err:
                    print("Received DockerError trying to kill worker: {}".format(
                        str(docker_err)))
            raise SpawnerException(message=message)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, worker_id):
        """Start one worker and wait for its first message.

        Returns ``(ch, reply_to, worker)`` when the worker reports status
        ``'ok'``. Raises ``SpawnerException`` when the reply has
        ``status == 'error'`` or any other non-ok status.
        """
        ch = WorkerChannel()
        # start an actor executor container and wait for a confirmation that
        # image was pulled.
        worker_dict = run_worker(image, ch.name, worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        print(
            "worker started successfully, waiting on ack that image was pulled..."
        )
        result = ch.get()

        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error
            # state:
            msg = "got an error back from the worker. Message: {}".format(
                result)
            print(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            raise SpawnerException(
                message="Internal error starting worker process.")

        if result['value']['status'] == 'ok':
            print("received ack from worker.")
            return ch, result['reply_to'], worker

        msg = "Got an error status from worker: {}. Raising an exception.".format(
            str(result))
        print(msg)
        raise SpawnerException(msg)

    def kill_worker(self, worker):
        """Placeholder: currently does nothing."""
        pass
class Spawner(object):
    """Consume commands from the command channel and start/stop actor workers.

    Each command is a dict with at least ``actor_id``, ``worker_ids``,
    ``image`` and ``tenant``; see ``process`` for the optional keys.
    """

    def __init__(self):
        # Default number of workers to start when a command does not say.
        self.num_workers = int(Config.get('workers', 'init_count'))
        # Passed along with every client request made in process().
        self.secret = os.environ.get('_abaco_secret')
        self.cmd_ch = CommandChannel()

    def run(self):
        """Loop forever, handing every command read from the channel to ``process``."""
        while True:
            cmd = self.cmd_ch.get()
            self.process(cmd)

    def stop_workers(self, actor_id, worker_ids):
        """Stop existing workers; used when updating an actor's image.

        Looks up the workers recorded for ``actor_id`` in ``workers_store``.
        When there are any, the actor message channel is closed and a
        ``'stop'`` message is put on the channel of every worker whose ``id``
        is not in ``worker_ids``; each such channel is closed afterwards.
        """
        logger.debug("Top of stop_workers() for actor: {}.".format(actor_id))
        try:
            workers_dict = workers_store[actor_id]
        except KeyError:
            logger.debug(
                "workers_store had no workers for actor: {}".format(actor_id))
            workers_dict = {}

        existing = list(workers_dict.items())
        if not existing:
            logger.info("No workers to stop.")
            return

        logger.info("Found {} workers to stop.".format(len(existing)))
        # first, close the actor msg channel to prevent any new messages from
        # being pulled by the old workers.
        actor_ch = ActorMsgChannel(actor_id)
        actor_ch.close()
        logger.info("Actor channel closed for actor: {}".format(actor_id))

        # now, send messages to workers for a graceful shutdown:
        for _, worker in existing:
            # don't stop the new workers:
            if worker['id'] in worker_ids:
                continue
            ch = WorkerChannel(worker_id=worker['id'])
            ch.put('stop')
            logger.info("Sent 'stop' message to worker channel: {}".format(ch))
            ch.close()

    @staticmethod
    def _subscribe_message(actor_id, tenant, client_msg=None):
        """Build the message that tells a new worker to subscribe to the actor channel.

        Without ``client_msg`` the message carries ``'client': 'no'``; with it,
        ``'client': 'yes'`` plus the client credentials copied from
        ``client_msg``.
        """
        message = {
            'status': 'ok',
            'actor_id': actor_id,
            'tenant': tenant,
            'client': 'no',
        }
        if client_msg is not None:
            message['client'] = 'yes'
            for key in ('client_id', 'client_secret', 'access_token',
                        'refresh_token', 'api_server'):
                message[key] = client_msg[key]
        return message

    def process(self, cmd):
        """Main spawner method for processing a command from the CommandChannel.

        Reads ``actor_id``, ``worker_ids``, ``image`` and ``tenant``
        (required) plus ``stop_existing`` (default ``True``) and ``num``
        (default ``self.num_workers``) from ``cmd``. Starts the workers,
        updates ``workers_store``, sends each new worker its subscribe message
        over its anonymous channel, then deletes the channels. Returns
        ``None`` in every case.
        """
        logger.info("Spawner processing new command:{}".format(cmd))
        actor_id = cmd['actor_id']
        worker_ids = cmd['worker_ids']
        image = cmd['image']
        tenant = cmd['tenant']
        stop_existing = cmd.get('stop_existing', True)
        num_workers = cmd.get('num', self.num_workers)
        # One placeholder per argument, so every value is logged under its own
        # label.
        logger.info(
            "command params: actor_id: {} worker_ids: {} image: {} tenant: {} "
            "stop_existing: {} num_workers: {}".format(
                actor_id, worker_ids, image, tenant, stop_existing,
                num_workers))

        try:
            new_channels, anon_channels, new_workers = self.start_workers(
                actor_id, worker_ids, image, tenant, num_workers)
        except SpawnerException:
            # for now, start_workers will do clean up for a SpawnerException,
            # so we just need to return back to the run loop.
            logger.info("Spawner returning to main run loop.")
            return
        logger.info("Created new workers: {}".format(new_workers))

        # stop any existing workers:
        if stop_existing:
            logger.info("Stopping existing workers: {}".format(worker_ids))
            self.stop_workers(actor_id, worker_ids)

        # add workers to store first so that the records will be there when
        # the workers go to update their status
        if stop_existing:
            # since we're stopping the existing workers, the actor's
            # collection should just be equal to the new_workers.
            workers_store[actor_id] = new_workers
            logger.info(
                "workers_store set to new_workers: {}.".format(new_workers))
        else:
            # if we're not stopping the existing workers, we need to add each
            # worker to the actor's collection.
            for worker in new_workers.values():
                logger.info(
                    "calling add_worker for worker: {}.".format(worker))
                Worker.add_worker(actor_id, worker)

        # Tell new worker to subscribe to the actor channel.
        # If abaco is configured to generate clients for the workers, generate
        # them now and send new workers their clients.
        generate_clients = Config.get('workers', 'generate_clients').lower()
        logger.info(
            "Sending messages to new workers over anonymous channels to subscribe to inbox."
        )
        for idx, channel in enumerate(anon_channels):
            if generate_clients == 'true':
                logger.info("Getting client for worker {}".format(idx))
                client_ch = ClientsChannel()
                try:
                    # start_workers stores the idx-th worker under
                    # worker_ids[idx] and appends its anonymous channel at
                    # position idx, so this lookup pairs channel and worker
                    # without relying on dict ordering.
                    client_msg = client_ch.request_client(
                        tenant=tenant,
                        actor_id=actor_id,
                        worker_id=new_workers[worker_ids[idx]]['id'],
                        secret=self.secret)
                except ChannelTimeoutException as e:
                    logger.error(
                        "Got a ChannelTimeoutException trying to generate a client: {}"
                        .format(e))
                    # put actor in an error state and return
                    # NOTE(review): the workers already started and the
                    # remaining channels are left untouched on this path.
                    self.error_out_actor(actor_id, [], str(e))
                    return
                finally:
                    # Close the clients channel on success, on timeout and on
                    # any unexpected error alike.
                    client_ch.close()

                # we need to ignore errors when generating clients because
                # it's possible it is not set up for a specific tenant. we log
                # it instead.
                if client_msg.get('status') == 'error':
                    logger.info("Error generating client: {}".format(
                        client_msg.get('message')))
                    channel.put(self._subscribe_message(actor_id, tenant))
                    logger.debug(
                        "Sent OK message over anonymous worker channel.")
                # else, client was generated successfully:
                else:
                    # Only the client id is logged; the access and refresh
                    # tokens are credentials and must not be written to logs.
                    logger.info("Got a client: {}".format(
                        client_msg['client_id']))
                    channel.put(
                        self._subscribe_message(actor_id, tenant, client_msg))
                    logger.debug(
                        "Sent OK message AND client over anonymous worker channel."
                    )
            else:
                logger.info(
                    "Not generating clients. Config value was: {}".format(
                        generate_clients))
                channel.put(self._subscribe_message(actor_id, tenant))
                logger.debug("Sent OK message over anonymous worker channel.")
            # @TODO -
            # delete the anonymous channel from this thread but sleep first to
            # avoid the race condition.
            time.sleep(1.5)
            channel.delete()

        # due to the race condition deleting channels (potentially before all
        # workers have received all messages) we put a sleep here.
        time.sleep(1)
        for ch in new_channels:
            try:
                # the new_channels are the spawnerworker channels so they can
                # be deleted.
                ch.delete()
            except Exception as e:
                # Best effort: a failed delete is logged and the rest of the
                # channels are still processed.
                logger.error(
                    "Got exception trying to delete spawnerworker channel: {}".
                    format(e))
        logger.info("Done processing command.")

    def start_workers(self, actor_id, worker_ids, image, tenant, num_workers):
        """Start ``num_workers`` workers for ``image``, using ``worker_ids`` in order.

        Returns ``(channels, anon_channels, workers)``: two lists in start
        order and a dict of workers keyed by worker id. If any worker fails to
        start, ``error_out_actor`` is called with the workers started so far
        and a new ``SpawnerException`` carrying the same message is raised.
        """
        # Report the number actually requested, not the configured default.
        logger.info("starting {} workers. actor_id: {} image: {}".format(
            str(num_workers), actor_id, image))
        channels = []
        anon_channels = []
        workers = {}
        try:
            for i in range(num_workers):
                worker_id = worker_ids[i]
                logger.info("starting worker {} with id: {}".format(
                    i, worker_id))
                ch, anon_ch, worker = self.start_worker(
                    image, tenant, worker_id)
                logger.debug("channel for worker {} is: {}".format(
                    str(i), ch.name))
                channels.append(ch)
                anon_channels.append(anon_ch)
                workers[worker_id] = worker
        except SpawnerException as e:
            logger.info("Caught SpawnerException:{}".format(str(e)))
            # in case of an error, put the actor in error state and kill all
            # workers
            self.error_out_actor(actor_id, workers, e.message)
            raise SpawnerException(message=e.message)
        return channels, anon_channels, workers

    def start_worker(self, image, tenant, worker_id):
        """Start one worker and wait for its first message.

        Returns ``(ch, reply_to, worker)`` when the worker reports status
        ``'ok'``. Raises ``SpawnerException`` when the reply has
        ``status == 'error'`` or any other non-ok status.
        """
        ch = SpawnerWorkerChannel(worker_id=worker_id)
        # start an actor executor container and wait for a confirmation that
        # image was pulled.
        worker_dict = run_worker(image, worker_id)
        worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
        worker = Worker(tenant=tenant, **worker_dict)
        logger.info(
            "worker started successfully, waiting on ack that image was pulled..."
        )
        result = ch.get()
        logger.debug(
            "Got response back from worker. Response: {}".format(result))

        if result.get('status') == 'error':
            # there was a problem pulling the image; put the actor in an error
            # state:
            msg = "Got an error back from the worker. Message: {}".format(
                result)
            logger.info(msg)
            if 'msg' in result:
                raise SpawnerException(message=result['msg'])
            logger.error(
                "Spawner received invalid message from worker. 'msg' field missing. Message: {}"
                .format(result))
            raise SpawnerException(
                message="Internal error starting worker process.")

        if result['value']['status'] == 'ok':
            logger.debug("received ack from worker.")
            return ch, result['reply_to'], worker

        msg = "Got an error status from worker: {}. Raising an exception.".format(
            str(result))
        # The placeholder makes the offending message show up in the log.
        logger.error(
            "Spawner received an invalid message from worker. Message: {}".
            format(result))
        raise SpawnerException(msg)

    def error_out_actor(self, actor_id, workers, message):
        """In case of an error, put the actor in error state and kill all workers.

        ``kill_worker`` is called once per element yielded by iterating
        ``workers``; a ``DockerError`` from it is logged and skipped.
        """
        Actor.set_status(actor_id, ERROR, status_message=message)
        # NOTE(review): start_workers passes a dict here, so iteration yields
        # worker ids rather than worker records -- confirm which one
        # kill_worker should receive once it is implemented.
        for worker in workers:
            try:
                self.kill_worker(worker)
            except DockerError as e:
                logger.info(
                    "Received DockerError trying to kill worker: {}. Exception: {}"
                    .format(worker, e))
                logger.info(
                    "Spawner will continue on since this is exception processing."
                )

    def kill_worker(self, worker):
        """Placeholder: currently does nothing."""
        pass