def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor.

    Looks up the actor for the current tenant, reads the requested worker
    count ``num`` from the validated POST arguments (a missing or zero value
    means 1) and, when fewer workers than that exist, requests the missing
    workers and puts one command on the command channel to start them.

    :param actor_id: the actor id as supplied by the caller.
    :return: the ``ok()`` response describing what was (or was not) scheduled.
    :raises ResourceError: with 404 when the actor is not in ``actors_store``.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        raise ResourceError("actor not found: {}'".format(actor_id), 404)
    args = self.validate_post()
    # Convert once, up front, so the comparison below is always int-vs-int.
    num = int(args.get('num') or 1)
    workers = Worker.get_workers(dbid)
    current = len(workers.items())
    if current >= num:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(
                      actor_id, num))
    num_to_add = num - current
    worker_ids = [Worker.request_worker(actor_id) for _ in range(num_to_add)]
    ch = CommandChannel()
    ch.put_cmd(actor_id=actor.db_id,
               worker_ids=worker_ids,
               image=actor.image,
               tenant=g.tenant,
               num=num_to_add,
               stop_existing=False)
    return ok(
        result=None,
        msg="Scheduled {} new worker(s) to start. There were only {} "
            "worker(s).".format(num_to_add, current))
def get(self, actor_id):
    """Return the display form of every worker registered for an actor.

    :param actor_id: the actor id as supplied by the caller.
    :return: ``ok()`` response whose result is a list of worker displays.
    :raises ResourceError: with 404 when the actor or its workers are missing.
    """
    logger.debug("top of GET /actors/{}/workers for tenant {}.".format(
        actor_id, g.tenant))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id),
                            404)
    try:
        found = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug(
            "did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)

    displays = []
    for worker_key, description in found.items():
        description.update({'id': worker_key})
        try:
            displays.append(Worker(**description).display())
        except Exception:
            # A description that cannot be turned into a Worker is logged
            # and left out of the listing rather than failing the request.
            logger.error(
                "Unable to instantiate worker in workers endpoint from description: {}. "
                .format(description))
    return ok(result=displays, msg="Workers retrieved successfully.")
async def jobs_dispatcher():
    """Pair scheduled jobs with available workers and start them.

    Creates a single "available" worker row if no worker exists at all.
    Each scheduled job is then matched, in query order, with one available
    worker: the job is marked "running" (with its start time), the worker is
    marked "busy", and a ``run_job`` task is stored in
    ``jobs_in_memory_state`` under the job id.

    When there are no scheduled jobs the coroutine sleeps for 3 seconds
    before returning. The sleep happens after the IMMEDIATE transaction has
    been closed so that the transaction is not held open while idle.
    """
    # Bootstrap: make sure at least one worker row exists.
    if Worker.select().count() == 0:
        Worker.create(state="available")

    available_workers = list(
        Worker.select().where(Worker.state == "available"))
    if not available_workers:
        # no available workers; nothing can be dispatched on this pass
        return

    with db.atomic('IMMEDIATE'):
        scheduled_jobs = list(Job.select().where(Job.state == "scheduled"))
        # zip() stops at the shorter sequence, i.e. min(#workers, #jobs) pairs.
        for job, worker in zip(scheduled_jobs, available_workers):
            job.state = "running"
            job.started_time = datetime.now()
            job.save()

            worker.state = "busy"
            worker.save()

            jobs_in_memory_state[job.id] = {
                "worker": worker.id,
                "task": asyncio.ensure_future(run_job(worker, job)),
            }

    if not scheduled_jobs:
        # no jobs to process: back off before the next pass
        await asyncio.sleep(3)
def check_new_params(self, cmd):
    """Run the extra validation required for new client requests.

    :param cmd: mapping that carries ``actor_id`` and ``worker_id``.
    :return: a ``(valid, msg, owner)`` tuple. ``valid`` and ``msg`` come from
        ``check_common``; ``owner`` is the actor's owner, prefixed with the
        tenant's userstore prefix when one is configured. When the actor or
        the worker cannot be looked up, ``(False, <reason>, None)``.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # validate the actor_id
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        failure = "Unable to look up actor with id: {}".format(actor_id)
        logger.error(failure)
        return False, failure, None

    # validate the worker id
    try:
        Worker.get_worker(actor_id=actor_id, worker_id=cmd.get('worker_id'))
    except WorkerException as e:
        failure = "Unable to look up worker: {}".format(e.msg)
        logger.error(failure)
        return False, failure, None

    logger.debug("new params were valid.")
    owner_prefix = get_tenant_userstore_prefix(actor.tenant)
    logger.debug(
        f"using owner prefix: {owner_prefix} for tenant: {actor.tenant}")
    owner = f"{owner_prefix}/{actor.owner}" if owner_prefix else actor.owner
    logger.debug(f"using owner: {owner}")
    return valid, msg, owner
def post(self, actor_id):
    """Ensure a certain number of workers are running for an actor.

    Looks up the actor for the current tenant, reads the requested worker
    count ``num`` from the validated POST arguments (a missing or zero value
    means 1) and, when fewer workers than that exist, requests the missing
    workers and puts one command on the command channel to start them.

    :param actor_id: the actor id as supplied by the caller.
    :return: the ``ok()`` response describing what was (or was not) scheduled.
    :raises ResourceError: with 404 when the actor or its workers are missing.
    """
    logger.debug("top of POST /actors/{}/workers.".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id),
                            404)
    args = self.validate_post()
    logger.debug(
        "workers POST params validated. actor: {}.".format(actor_id))
    num = args.get('num')
    if not num:
        logger.debug("did not get a num: {}.".format(actor_id))
        num = 1
    # Convert once, up front, so the comparison below is always int-vs-int.
    num = int(num)
    logger.debug("ensuring at least {} workers. actor: {}.".format(
        num, actor_id))
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        logger.debug(
            "did not find workers for actor: {}.".format(actor_id))
        raise ResourceError(e.msg, 404)
    current_number_workers = len(workers.items())
    if current_number_workers >= num:
        return ok(result=None,
                  msg="Actor {} already had {} worker(s).".format(
                      actor_id, num))

    logger.debug(
        "There were only {} workers for actor: {} so we're adding more."
        .format(current_number_workers, actor_id))
    num_to_add = num - current_number_workers
    logger.info("adding {} more workers for actor {}".format(
        num_to_add, actor_id))
    worker_ids = [
        Worker.request_worker(tenant=g.tenant, actor_id=actor_id)
        for _ in range(num_to_add)
    ]
    logger.info("New worker ids: {}".format(worker_ids))
    ch = CommandChannel()
    try:
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=g.tenant,
                   num=num_to_add,
                   stop_existing=False)
    finally:
        # Close the channel even when putting the command fails.
        ch.close()
    logger.info(
        "Message put on command channel for new worker ids: {}".format(
            worker_ids))
    return ok(
        result=None,
        msg="Scheduled {} new worker(s) to start. There were only {} "
            "worker(s).".format(num_to_add, current_number_workers))
def process_worker_ch(tenant, worker_ch, actor_id, worker_id, actor_ch,
                      ag_client):
    """Thread target: service the worker channel until a stop message arrives.

    Polls ``worker_ch`` with a 2 second timeout. A dict message whose
    ``value`` is ``'status'`` is a health check and is answered with ``'ok'``
    on the message's ``reply_to`` channel. The message ``'stop'`` triggers
    shutdown: the associated client (if one was passed) is deleted through
    the clients channel, the worker record is deleted, the module-level
    ``keep_running`` flag is cleared, the actor channel is closed and
    ``sys.exit()`` is called.

    :param tenant: tenant passed through to the client delete request.
    :param worker_ch: channel to listen on.
    :param actor_id: id of the actor this worker belongs to.
    :param worker_id: id of this worker.
    :param actor_ch: actor channel; closed on shutdown.
    :param ag_client: client object exposing ``api_key``, or a falsy value.
    :return: does not return normally; leaves via ``sys.exit()``.
    """
    global keep_running
    logger.info("Worker subscribing to worker channel...")
    while True:
        try:
            incoming = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        logger.debug("Received message in worker channel: {}".format(incoming))
        logger.debug("Type(msg)={}".format(type(incoming)))

        if type(incoming) == dict:
            if incoming.get('value', '') == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                logger.debug("received health check. returning 'ok'.")
                incoming['reply_to'].put('ok')
            continue
        if not incoming == 'stop':
            continue

        logger.info("Received stop message, stopping worker...")
        # The worker may not have been passed a client; when it was, the
        # client has to be deleted before shutting down.
        if not ag_client:
            logger.info(
                "Did not receive client. Not issuing delete. Exiting.")
        else:
            logger.info("Requesting client {} be deleted.".format(
                ag_client.api_key))
            secret = os.environ.get('_abaco_secret')
            response = ClientsChannel().request_delete_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=worker_id,
                client_id=ag_client.api_key,
                secret=secret)
            if response['status'] == 'ok':
                logger.info("Delete request completed successfully.")
            else:
                logger.error("Error deleting client. Message: {}".format(
                    response['message']))

        try:
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            logger.info(
                "Got WorkerException from delete_worker(). Exception: {}".
                format(e))
        keep_running = False
        actor_ch.close()
        logger.info("Closing actor channel for actor: {}".format(actor_id))
        logger.info("Worker is now exiting.")
        sys.exit()
def create_worker(username, password, role):
    """Create a user account and its Worker record; return the worker id.

    The user is first created without a password, then the password is set
    through ``set_password`` and the user saved again, before the ``Worker``
    linking ``role`` and the user is saved.
    """
    account = User.objects.create_user(username, password=None)
    account.set_password(password)
    account.save()

    record = Worker(role=role, user=account)
    record.save()
    return record.id
def test_boss_workers_sequence(self):
    """Workers added to the boss are kept in insertion order."""
    worker1 = Worker(1, 'w1', '')
    worker2 = Worker(1, 'w2', '')
    worker3 = Worker(1, 'w3', '')
    # Deliberately not in name order, so ordering is really what is tested.
    _workers = [worker1, worker3, worker2]
    for worker in _workers:
        self.boss.add_worker(worker)
    # assertEqual reports both sequences on failure; assertTrue(a == b)
    # would only report "False is not true".
    self.assertEqual(self.boss.workers, _workers)
def kill_worker(self, actor_id, worker_id):
    """Delete a worker record, logging (not raising) any failure.

    :param actor_id: id of the actor that owns the worker.
    :param worker_id: id of the worker to delete.
    """
    try:
        Worker.delete_worker(actor_id, worker_id)
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        # Best effort: an unexpected failure is logged at error level but
        # still not propagated to the caller.
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def kill_worker(self, actor_id, worker_id):
    """Delete a worker record, logging (not raising) any failure.

    :param actor_id: id of the actor that owns the worker.
    :param worker_id: id of the worker to delete.
    """
    logger.debug(f"top of kill_worker: {actor_id}_{worker_id}")
    try:
        Worker.delete_worker(actor_id, worker_id)
        logger.debug(f"worker deleted; {actor_id}_{worker_id}")
    except WorkerException as e:
        logger.info("Got WorkerException from delete_worker(). "
                    "worker_id: {}; "
                    "Exception: {}".format(worker_id, e))
    except Exception as e:
        # Best effort: an unexpected failure is logged at error level but
        # still not propagated to the caller.
        logger.error("Got an unexpected exception from delete_worker(). "
                     "worker_id: {}; "
                     "Exception: {}".format(worker_id, e))
def check_new_params(self, cmd):
    """Run the extra validation required for new client requests.

    :param cmd: mapping that carries ``actor_id`` and ``worker_id``.
    :return: ``(valid, msg, owner)`` where ``valid``/``msg`` come from
        ``check_common`` and ``owner`` is the actor's owner, or
        ``(False, <reason>, None)`` when the actor or worker lookup fails.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # validate the actor_id
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        reason = "Unable to look up actor with id: {}".format(actor_id)
        return False, reason, None

    # validate the worker id
    try:
        Worker.get_worker(actor_id=actor_id, worker_id=cmd.get('worker_id'))
    except WorkerException as e:
        reason = "Unable to look up worker: {}".format(e.msg)
        return False, reason, None

    return valid, msg, actor.owner
def check_new_params(self, cmd):
    """Run the extra validation required for new client requests.

    :param cmd: mapping that carries ``actor_id`` and ``worker_id``; the
        worker id is passed to ``Worker.get_worker`` as ``ch_name``.
    :return: ``(valid, msg, owner)`` where ``valid``/``msg`` come from
        ``check_common`` and ``owner`` is the actor's owner, or
        ``(False, <reason>, None)`` when the actor or worker lookup fails.
    """
    valid, msg = self.check_common(cmd)
    actor_id = cmd.get('actor_id')

    # validate the actor_id
    try:
        actor = Actor.from_db(actors_store[actor_id])
    except KeyError:
        reason = "Unable to look up actor with id: {}".format(actor_id)
        return False, reason, None

    # validate the worker id
    try:
        Worker.get_worker(actor_id=actor_id, ch_name=cmd.get('worker_id'))
    except WorkerException as e:
        reason = "Unable to look up worker: {}".format(e.msg)
        return False, reason, None

    return valid, msg, actor.owner
def handle_signup_worker():
    """Register a worker so that it is included in the DB.

    Reads the request body as a JSON object, builds a ``Worker`` from the
    fields listed below, stores it and returns its serialized form.

    :return: ``(serialized_worker, 201)``.
    :raises APIException: with status 400 when the body is not a JSON object
        or when any of the expected fields is missing.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        raise APIException(
            "You need to specify the request body as a json object",
            status_code=400)

    # Every one of these keys is copied verbatim into the Worker constructor.
    worker_fields = (
        'init_date',
        'Consultor',
        'candidate',
        'cedula',
        'status',
        'phone_number',
        'email',
        'catchment_source',
        'managment',
        'vacant',
        'Observations',
        'interview_date',
        'actual_charge',
        'company',
        'sector',
        'coin',
        'basic_salary',
        'variable_salary',
        'cesta_ticket',
        'Profit_Days',
        'vacations',
        'Vacation_Bonus',
        'Factor',
        'Estimated_annual_package',
        'Production_bonus',
        'Transport_bonus',
        'Savings_Bank',
        'food_bags',
        'parking_payment',
        'partial_HCM_Emp_Family',
        'Vehicle_insurance',
        'life_insurance',
        'dinning_room',
        'full_HCM_Emp_Family',
    )

    # Report all missing fields at once as a client error instead of letting
    # the first missing key surface as an unhandled KeyError.
    missing = [name for name in worker_fields if name not in data]
    if missing:
        raise APIException(
            "Missing required field(s): {}".format(", ".join(missing)),
            status_code=400)

    new_worker = Worker(**{name: data[name] for name in worker_fields})
    db.session.add(new_worker)
    db.session.commit()
    return new_worker.serialize(), 201
def process_worker_ch(tenant, worker_ch, actor_id, actor_ch, ag_client):
    """Thread target: service the worker channel until a stop message arrives.

    Polls ``worker_ch`` with a 2 second timeout. A dict message whose
    ``value`` is ``'status'`` is a health check and is answered with ``'ok'``
    on the message's ``reply_to`` channel. The message ``'stop'`` triggers
    shutdown: the associated client (if one was passed) is deleted through
    the clients channel, the worker record (keyed by ``worker_ch.name``) is
    deleted, the module-level ``keep_running`` flag is cleared, the actor
    channel is closed and ``sys.exit()`` is called.

    :param tenant: tenant passed through to the client delete request.
    :param worker_ch: channel to listen on; its ``name`` identifies the worker.
    :param actor_id: id of the actor this worker belongs to.
    :param actor_ch: actor channel; closed on shutdown.
    :param ag_client: client object exposing ``api_key``, or a falsy value.
    :return: does not return normally; leaves via ``sys.exit()``.
    """
    global keep_running
    print("Worker subscribing to worker channel...")
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        print("Received message in worker channel: {}".format(msg))
        print("Type(msg)={}".format(type(msg)))

        if isinstance(msg, dict):
            if msg.get('value', '') == 'status':
                # this is a health check, return 'ok' to the reply_to channel.
                msg['reply_to'].put('ok')
            continue
        if msg != 'stop':
            continue

        print("Received stop message, stopping worker...")
        # The worker may not have been passed a client; when it was, the
        # client has to be deleted before shutting down.
        if ag_client:
            print("Requesting client {} be deleted.".format(ag_client.api_key))
            secret = os.environ.get('_abaco_secret')
            clients_ch = ClientsChannel()
            response = clients_ch.request_delete_client(
                tenant=tenant,
                actor_id=actor_id,
                worker_id=worker_ch.name,
                client_id=ag_client.api_key,
                secret=secret)
            if response['status'] == 'ok':
                print("Delete request completed successfully.")
            else:
                print("Error deleting client. Message: {}".format(
                    response['message']))
        else:
            print("Did not receive client. Not issuing delete. Exiting.")

        try:
            Worker.delete_worker(actor_id, worker_ch.name)
        except WorkerException as e:
            # Still best effort (shutdown continues), but no longer silent.
            print("Got WorkerException from delete_worker(). "
                  "Exception: {}".format(e))
        keep_running = False
        actor_ch.close()
        sys.exit()
def forget_password():
    """Send a confirmation e-mail containing a token to a registered worker.

    Expects a JSON body with an ``email`` key.

    :return: ``(json, 200)`` when the mail was sent, ``(json, 400)`` when no
        e-mail address was supplied, ``(json, 404)`` when no worker has that
        address.
    """
    payload = request.get_json(silent=True)
    email = payload.get('email') if isinstance(payload, dict) else None
    if not email:
        # A view must return a response; returning None makes Flask fail.
        return jsonify({"msg": "Email is required"}), 400

    worker = Worker.query.filter_by(email=email).first()
    if not worker:
        return jsonify({"msg": "This email is not registered"}), 404

    # The token is a credential: it is only ever placed in the e-mail and is
    # deliberately not printed or logged.
    token = generate_confirmation_token(worker.email)
    # NOTE(review): the confirmation host is hard-coded to a local dev URL.
    confirm_url = 'http://localhost:3000/confirmation/' + token
    html = render_template('email_confirmation.html', confirm_url=confirm_url)
    subject = "Por favor, Confirmar su email."
    sendMail(subject, '', '', worker.email, html)
    return jsonify({"success": "Email send successfully"}), 200
def scale_down(actor_id):
    """Shut down every READY worker of an actor.

    Pops the workers returned by ``Worker.get_workers`` one at a time; each
    worker whose status is ``'READY'`` is shut down with
    ``delete_actor_ch=False``. Failures are logged and never raised.

    :param actor_id: id of the actor whose workers are examined.
    """
    workers = Worker.get_workers(actor_id)
    logger.debug('METRICS NUMBER OF WORKERS: {}'.format(len(workers)))
    try:
        while len(workers) > 0:
            logger.debug('METRICS made it STATUS check')
            worker = workers.popitem()[1]
            logger.debug('METRICS SCALE DOWN current worker: {}'.format(
                worker['status']))
            # only workers that are READY are scaled down
            if worker['status'] != 'READY':
                continue
            try:
                shutdown_worker(worker['id'], delete_actor_ch=False)
            except Exception as e:
                logger.debug(
                    'METRICS ERROR shutting down worker: {} - {} - {}'.
                    format(type(e), e, e.args))
            else:
                # Logged only when the shutdown call actually succeeded.
                logger.debug('METRICS shut down worker {}'.format(
                    worker['id']))
    except IndexError:
        logger.debug('METRICS only one worker found for actor {}. '
                     'Will not scale down'.format(actor_id))
    except Exception as e:
        logger.debug("METRICS SCALE DOWN FAILED: {}".format(e))
def start_worker(self, image, tenant, worker_id):
    """Start a worker and wait for its acknowledgement that the image was pulled.

    :param image: image to run the worker with.
    :param tenant: tenant the worker belongs to.
    :param worker_id: id of the worker being started.
    :return: ``(ch, reply_to, worker)`` - the spawner/worker channel, the
        ``reply_to`` entry of the ack message and the new ``Worker``.
    :raises SpawnerException: when the worker reports an error or sends a
        message that is not a valid ack.
    """
    ch = SpawnerWorkerChannel(worker_id=worker_id)
    # start an actor executor container and wait for a confirmation that
    # image was pulled.
    worker_dict = run_worker(image, worker_id)
    worker_dict['ch_name'] = WorkerChannel.get_name(worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    logger.info(
        "worker started successfully, waiting on ack that image was pulled..."
    )
    result = ch.get()
    logger.debug(
        "Got response back from worker. Response: {}".format(result))

    if result.get('status') == 'error':
        # there was a problem pulling the image
        msg = "Got an error back from the worker. Message: {}".format(result)
        logger.info(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        logger.error(
            "Spawner received invalid message from worker. 'msg' field missing. Message: {}"
            .format(result))
        raise SpawnerException(
            message="Internal error starting worker process.")

    # A message without a usable value/status is an invalid message and must
    # end in SpawnerException below, not in a KeyError/TypeError.
    try:
        ack_status = result['value']['status']
    except (KeyError, TypeError):
        ack_status = None

    if ack_status == 'ok':
        logger.debug("received ack from worker.")
        return ch, result['reply_to'], worker

    msg = "Got an error status from worker: {}. Raising an exception.".format(
        str(result))
    logger.error(
        "Spawner received an invalid message from worker. Message: {}".
        format(result))
    raise SpawnerException(msg)
def put(self, actor_id):
    """Update an actor; when its image changed, schedule a worker for it.

    :param actor_id: the actor id as supplied by the caller.
    :return: ``ok()`` response carrying the updated actor's display form.
    :raises ResourceError: with 404 when the actor is not in ``actors_store``.
    """
    logger.debug("top of PUT /actors/{}".format(actor_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("did not find actor {} in store.".format(dbid))
        raise ResourceError(
            "No actor found with id: {}.".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    logger.debug("PUT args validated successfully.")
    args['tenant'] = g.tenant

    update_image = args['image'] != previous_image
    if update_image:
        args['status'] = SUBMITTED
        logger.debug("new image is different. updating actor.")
    else:
        logger.debug("new image is the same. not updating actor.")
        args['status'] = actor.status

    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()
    logger.info("updated actor {} stored in db.".format(actor_id))

    if update_image:
        # Request the worker only when a command is actually sent for it, so
        # an unchanged image does not leave a requested worker behind.
        # `worker_ids` is a list, as in the workers POST endpoint.
        worker_ids = [Worker.request_worker(actor.db_id)]
        ch = CommandChannel()
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_ids=worker_ids,
                       image=actor.image,
                       tenant=args['tenant'])
        finally:
            ch.close()
        logger.debug("put new command on command channel to update actor.")
    return ok(result=actor.display(), msg="Actor updated successfully.")
def start_worker(self, image, tenant, worker_id):
    """Start a worker and wait for its acknowledgement that the image was pulled.

    :param image: image to run the worker with.
    :param tenant: tenant the worker belongs to.
    :param worker_id: id of the worker being started.
    :return: ``(ch, reply_to, worker)`` - the worker channel, the
        ``reply_to`` entry of the ack message and the new ``Worker``.
    :raises SpawnerException: when the worker reports an error or sends a
        message that is not a valid ack.
    """
    ch = WorkerChannel()
    # start an actor executor container and wait for a confirmation that
    # image was pulled.
    worker_dict = run_worker(image, ch.name, worker_id)
    worker = Worker(tenant=tenant, **worker_dict)
    print(
        "worker started successfully, waiting on ack that image was pulled..."
    )
    result = ch.get()

    if result.get('status') == 'error':
        # there was a problem pulling the image
        msg = "got an error back from the worker. Message: {}".format(result)
        print(msg)
        if 'msg' in result:
            raise SpawnerException(message=result['msg'])
        raise SpawnerException(
            message="Internal error starting worker process.")

    # A message without a usable value/status is an invalid message and must
    # end in SpawnerException below, not in a KeyError/TypeError.
    try:
        ack_status = result['value']['status']
    except (KeyError, TypeError):
        ack_status = None

    if ack_status == 'ok':
        print("received ack from worker.")
        return ch, result['reply_to'], worker

    msg = "Got an error status from worker: {}. Raising an exception.".format(
        str(result))
    print(msg)
    raise SpawnerException(msg)
def put(self, actor_id):
    """Update an actor; when its image changed, schedule a worker for it.

    :param actor_id: the actor id as supplied by the caller.
    :return: ``ok()`` response carrying the updated actor's display form.
    :raises ResourceError: with 404 when the actor is not in ``actors_store``.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        actor = Actor.from_db(actors_store[dbid])
    except KeyError:
        raise ResourceError("actor not found: {}'".format(actor_id), 404)
    previous_image = actor.image
    args = self.validate_put(actor)
    args['tenant'] = g.tenant

    update_image = args['image'] != previous_image
    # An unchanged image keeps the current status; a new image resets it.
    args['status'] = SUBMITTED if update_image else actor.status
    args['api_server'] = g.api_server
    args['owner'] = g.user
    actor = Actor(**args)
    actors_store[actor.db_id] = actor.to_db()

    if update_image:
        # Request the worker only when a command is actually sent for it, so
        # an unchanged image does not leave a requested worker behind.
        # `worker_ids` is a list, as in the workers POST endpoint.
        worker_ids = [Worker.request_worker(actor.db_id)]
        ch = CommandChannel()
        ch.put_cmd(actor_id=actor.db_id,
                   worker_ids=worker_ids,
                   image=actor.image,
                   tenant=args['tenant'])
    return ok(result=actor.display(), msg="Actor updated successfully.")
def add_worker():
    """Create a worker from the submitted form in the current agent's workshop.

    The agent is the ``Worker`` whose ``worker_id`` equals the ``user_id``
    stored in the session; the new worker gets that agent's ``workshop_id``.

    :return: the rendered ``agent/agent_add.html`` template, with
        ``success=True`` after a commit, or ``success=False`` (after a
        rollback) when anything in the process fails.
    """
    try:
        form = request.form
        new_id = int(form.get('worker_id'))
        name = form.get('worker_name')
        id_card = form.get('id_card_no')
        age = int(form.get('age'))
        sex = form.get('sex')
        tel = form.get('telephone')
        address = form.get('address')
        salary = int(form.get('salary'))
        work_type = form.get('worker_type')

        user_id = session['user_id']
        agent = Worker.query.filter(Worker.worker_id == user_id).first()
        workshop_id = agent.workshop_id

        worker = Worker(worker_id=new_id,
                        name=name,
                        id_card_NO=id_card,
                        age=age,
                        sex=sex,
                        contart_info=tel,
                        address=address,
                        salary=salary,
                        work_type=work_type,
                        workshop_id=workshop_id)
        db.session.add(worker)
        db.session.commit()
        return render_template('agent/agent_add.html',
                               success=True,
                               msg="添加成功~")
    except Exception:
        # Any failure (bad number, missing agent, DB error) is reported to
        # the user the same way; `except Exception` lets SystemExit and
        # KeyboardInterrupt through, unlike a bare `except:`.
        db.session.rollback()
        return render_template('agent/agent_add.html',
                               success=False,
                               msg="添加失败,请确保输入信息无误~")
def scale_up(actor_id):
    """Request one additional worker for an actor.

    :param actor_id: the actor's db id as bytes, of the form
        ``b"<tenant>_<id>"``.
    :return: the name of the command channel the request was put on (the
        actor's ``queue``, or ``'default'``), or ``None`` if anything failed.
    :raises ValueError: when the decoded id does not split into exactly two
        ``_``-separated parts (this happens before the guarded section).
    """
    tenant, _ = actor_id.decode('utf8').split('_')
    logger.debug(
        'METRICS Attempting to create a new worker for {}'.format(actor_id))
    try:
        # create a worker & add to this actor
        actor = Actor.from_db(actors_store[actor_id])
        worker_id = Worker.request_worker(tenant=tenant, actor_id=actor_id)
        logger.info("New worker id: {}".format(worker_id))
        channel_name = actor.queue if actor.queue else 'default'
        ch = CommandChannel(name=channel_name)
        try:
            ch.put_cmd(actor_id=actor.db_id,
                       worker_id=worker_id,
                       image=actor.image,
                       tenant=tenant,
                       stop_existing=False)
        finally:
            # Close the channel even when putting the command fails.
            ch.close()
        logger.debug(
            'METRICS Added worker successfully for {}'.format(actor_id))
        return channel_name
    except Exception as e:
        logger.debug("METRICS - SOMETHING BROKE: {} - {} - {}".format(
            type(e), e, e.args))
        return None
def error_out_actor(self, actor_id, worker_id, message):
    """In case of an error, put the actor in error state and kill all workers.

    The actor's status is set to ERROR only if the actor is still in the
    store. What happens to the worker depends on its current status:

    * UPDATING_STORE, READY or BUSY: ``stop_workers`` is tried first; if that
      raises, the status is treated as ERROR and handling continues below.
    * REQUESTED, SPAWNER_SETUP, PULLING_IMAGE or ERROR: the worker record is
      removed with ``kill_worker``.

    If the worker's status cannot be determined, nothing is done for it.

    :param actor_id: id of the actor to put in error state.
    :param worker_id: id of the worker that hit the error.
    :param message: status message stored with the ERROR status.
    """
    logger.debug("top of error_out_actor for worker: {}_{}".format(
        actor_id, worker_id))
    # it is possible the actor was deleted already -- only set the actor
    # status to ERROR if it still exists in the store
    if actors_store.get(actor_id):
        Actor.set_status(actor_id, ERROR, status_message=message)

    # check to see how far the spawner got setting up the worker:
    try:
        worker_status = Worker.get_worker(actor_id, worker_id).get('status')
        logger.debug(
            f"got worker status for {actor_id}_{worker_id}; status: {worker_status}"
        )
    except Exception as e:
        logger.debug(
            f"got exception in error_out_actor trying to determine worker status for {actor_id}_{worker_id}; "
            f"e:{e};")
        # without a status, all worker processing is skipped.
        return

    running_states = (UPDATING_STORE, READY, BUSY)
    if worker_status in running_states:
        logger.debug(
            f"worker status was: {worker_status}; trying to stop_worker")
        # for workers whose containers are running, first try the "graceful"
        # approach
        try:
            self.stop_workers(actor_id, worker_ids=[])
        except Exception as e:
            logger.error(
                "spawner got exception trying to run stop_workers. Exception: {}"
                .format(e))
            logger.info(
                "setting worker_status to ERROR so that kill_worker will run."
            )
            worker_status = ERROR
        else:
            logger.info(
                "Spawner just stopped worker {}_{} in error_out_actor".
                format(actor_id, worker_id))
            return

    # if the spawner was never able to start the worker container, simply
    # delete the worker record
    never_started_states = (REQUESTED, SPAWNER_SETUP, PULLING_IMAGE, ERROR)
    if worker_status not in never_started_states:
        return
    logger.debug(
        f"worker status was: {worker_status}; trying to kill_worker")
    try:
        self.kill_worker(actor_id, worker_id)
        logger.info(
            "Spawner just killed worker {}_{} in error_out_actor".
            format(actor_id, worker_id))
    except DockerError as e:
        logger.info(
            "Received DockerError trying to kill worker: {}. Exception: {}"
            .format(worker_id, e))
        logger.info(
            "Spawner will continue on since this is exception processing."
        )
def delete(self, actor_id, worker_id):
    """Schedule a worker of the given actor to be stopped.

    :raises ResourceError: with 404 when the worker cannot be looked up.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        found = Worker.get_worker(dbid, worker_id)
    except WorkerException as e:
        raise ResourceError(e.msg, 404)
    # The shutdown request is addressed by the worker's channel name.
    channel_name = found['ch_name']
    shutdown_worker(channel_name)
    return ok(result=None, msg="Worker scheduled to be stopped.")
def make_new_worker(amz_dict, session):
    """Build a Worker from ``amz_dict``, add it to ``session`` and commit.

    :param amz_dict: mapping with the keys ``workerId``, ``assignmentId``,
        ``hitId`` and ``list``.
    :param session: session object providing ``add`` and ``commit``.
    :return: the committed ``Worker``.
    """
    fields = {
        'workerid': amz_dict['workerId'],
        'assignmentid': amz_dict['assignmentId'],
        'hitid': amz_dict['hitId'],
        'list_id': amz_dict['list'],
    }
    new_worker = Worker(**fields)
    session.add(new_worker)
    session.commit()
    return new_worker
def delete(self, actor_id, ch_name):
    """Schedule the worker listening on ``ch_name`` to be stopped.

    :raises APIException: with 404 when the worker cannot be looked up.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # The lookup only verifies the worker exists; its data is not used.
    try:
        Worker.get_worker(dbid, ch_name)
    except WorkerException as e:
        raise APIException(e.msg, 404)
    shutdown_worker(ch_name)
    return ok(result=None, msg="Worker scheduled to be stopped.")
def shutdown_workers(actor_id):
    """Gracefully shut down all workers for an actor.

    Pass the db_id as the ``actor_id`` argument. Workers that already have a
    channel are told to shut themselves down; workers without one are simply
    deleted from the db.
    """
    # @TODO - this code is not thread safe. we need to update the workers
    # state in a transaction:
    for worker in Worker.get_workers(actor_id).values():
        if 'ch_name' in worker:
            # there is already a channel: tell the worker to shut itself down
            shutdown_worker(worker['ch_name'])
            continue
        # otherwise, just remove from db:
        try:
            worker_id = worker.get('id')
            Worker.delete_worker(actor_id, worker_id)
        except WorkerException as e:
            print(
                "Got a WorkerException trying to delete worker with id: {}; exception: {}"
                .format(worker_id, e))
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker on this spawner's host, a ``'status'`` message is sent
    on the worker's channel. Workers that do not answer within 5 seconds, or
    answer with anything other than ``'ok'``, have their container removed
    and their record deleted. Healthy workers that are READY and have been
    idle longer than ``ttl`` are shut down.

    :param actor_id: id of the actor whose workers are checked.
    :param ttl: idle time (compared against ``time.time()`` values) after
        which a READY worker is shut down; a negative value means workers
        live forever.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for worker in workers.values():
        # if the worker has only been requested, it will not have a host_id.
        if 'host_id' not in worker:
            # @todo- we will skip for now, but we need something more robust
            # in case the worker is never claimed.
            continue
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually
        # kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting "
                  "worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # Deliberate best effort: the record is deleted even when
                # the container cannot be removed.
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(
                result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # The worker is gone; there is nothing left to ttl-check.
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Move on to the next worker rather
            # than returning, so the remaining workers are still checked.
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy.

    As written, this only loads the actor and its workers; it returns early
    (after printing a note) when the actor is not in ``actors_store`` and
    takes no scaling action.
    """
    print("Entering manage_workers for {}".format(actor_id))
    try:
        stored = actors_store[actor_id]
        actor = Actor.from_db(stored)
    except KeyError:
        print("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
def shutdown_worker(actor_id, worker_id, delete_actor_ch=True):
    """Gracefully shut down a single worker.

    Marks the worker as SHUTDOWN_REQUESTED (a failure to do so is logged and
    otherwise ignored) and then sends it a stop message on its channel.

    :param actor_id: (str) the dbid of the associated actor.
    :param worker_id: id of the worker to stop.
    :param delete_actor_ch: when true the message sent is ``"stop"``,
        otherwise ``"stop-no-delete"``.
    """
    logger.debug("top of shutdown_worker for worker_id: {}".format(worker_id))
    # set the worker status to SHUTDOWN_REQUESTED:
    try:
        Worker.update_worker_status(actor_id, worker_id, SHUTDOWN_REQUESTED)
    except Exception as e:
        logger.error(
            f"worker got exception trying to update status to SHUTDOWN_REQUESTED. actor_id: {actor_id}; "
            f"worker_id: {worker_id}; exception: {e}")
    ch = WorkerChannel(worker_id=worker_id)
    try:
        ch.put("stop" if delete_actor_ch else "stop-no-delete")
        logger.info(
            "A 'stop' message was sent to worker: {}".format(worker_id))
    finally:
        # Close the channel even when sending the message fails.
        ch.close()
def render(self):
    """Print the "Add" screen and the confirmation prompt.

    When no pending worker is stored on ``self._tempWorker``, the name and
    age are read from standard input and a new ``Worker`` is stored there
    first.
    """
    print("===Add===")
    if not self._tempWorker:
        entered_name = input("\nName: ")
        entered_age = input("Age: ")
        self._tempWorker = Worker(entered_name, entered_age, "")
    prompt = "\nAdd {0}? (y/n)".format(self._tempWorker.name)
    print(prompt)
def get(self, actor_id, worker_id):
    """Return the display form of a single worker of an actor.

    :raises ResourceError: with 404 when the actor or the worker is missing.
    """
    logger.debug("top of GET /actors/{}/workers/{}.".format(
        actor_id, worker_id))
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        logger.debug("Did not find actor: {}.".format(actor_id))
        raise ResourceError("No actor found with id: {}.".format(actor_id),
                            404)
    try:
        description = Worker.get_worker(dbid, worker_id)
    except WorkerException as e:
        logger.debug("Did not find worker: {}. actor: {}.".format(
            worker_id, actor_id))
        raise ResourceError(e.msg, 404)
    # The store hands back a plain dictionary; add the id and convert it to
    # a Worker object so it can be displayed.
    description.update({'id': worker_id})
    return ok(result=Worker(**description).display(),
              msg="Worker retrieved successfully.")
def manage_workers(actor_id):
    """Scale workers for an actor based on message queue size and policy.

    As written, this only loads the actor and its workers; it returns early
    (after logging a note) when the actor is not in ``actors_store`` and
    takes no scaling action.
    """
    logger.info("Entering manage_workers for {}".format(actor_id))
    try:
        stored = actors_store[actor_id]
        actor = Actor.from_db(stored)
    except KeyError:
        logger.info("Did not find actor; returning.")
        return
    workers = Worker.get_workers(actor_id)
def get(self, actor_id, worker_id):
    """Return the stored description of a single worker of an actor.

    :param actor_id: the actor id as supplied by the caller.
    :param worker_id: id of the worker to retrieve.
    :return: ``ok()`` response whose result is the worker description.
    :raises ResourceError: with 404 when the actor or the worker is missing.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        # A missing actor is a "not found" condition, reported the same way
        # as a missing worker below.
        raise ResourceError("actor not found: {}'".format(actor_id), 404)
    try:
        worker = Worker.get_worker(dbid, worker_id)
    except WorkerException as e:
        raise ResourceError(e.msg, 404)
    return ok(result=worker, msg="Worker retrieved successfully.")
def get(self, actor_id, ch_name):
    """Return the stored description of the worker listening on ``ch_name``.

    :param actor_id: the actor id as supplied by the caller.
    :param ch_name: channel name identifying the worker.
    :return: ``ok()`` response whose result is the worker description.
    :raises APIException: with 404 when the actor or the worker is missing.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # The actor object itself is not needed; this only proves it exists.
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        # A missing actor is a "not found" condition, reported the same way
        # as a missing worker below.
        raise APIException("actor not found: {}'".format(actor_id), 404)
    try:
        worker = Worker.get_worker(dbid, ch_name)
    except WorkerException as e:
        raise APIException(e.msg, 404)
    return ok(result=worker, msg="Worker retrieved successfully.")
def check_workers(actor_id, ttl):
    """Check health of all workers for an actor.

    For every worker on this spawner's host, a ``'status'`` message is sent
    on the worker's channel. Workers that do not answer within 5 seconds, or
    answer with anything other than ``'ok'``, have their container removed
    and their record deleted. Healthy workers that are READY and have been
    idle longer than ``ttl`` are shut down.

    :param actor_id: id of the actor whose workers are checked.
    :param ttl: idle time (compared against ``time.time()`` values) after
        which a READY worker is shut down; a negative value means workers
        live forever.
    """
    print("Checking health for actors: {}".format(actor_id))
    workers = Worker.get_workers(actor_id)
    print("workers: {}".format(workers))
    for worker in workers.values():
        # ignore workers on different hosts
        if not Config.get('spawner', 'host_id') == worker['host_id']:
            continue
        # first check if worker is responsive; if not, will need to manually
        # kill
        print("Checking health for worker: {}".format(worker))
        ch = WorkerChannel(name=worker['ch_name'])
        try:
            print("Issuing status check to channel: {}".format(
                worker['ch_name']))
            result = ch.put_sync('status', timeout=5)
        except channelpy.exceptions.ChannelTimeoutException:
            print("Worker did not respond, removing container and deleting "
                  "worker.")
            try:
                rm_container(worker['cid'])
            except DockerError:
                # Deliberate best effort: the record is deleted even when
                # the container cannot be removed.
                pass
            Worker.delete_worker(actor_id, worker['ch_name'])
            continue
        if not result == 'ok':
            print("Worker responded unexpectedly: {}, deleting worker.".format(
                result))
            rm_container(worker['cid'])
            Worker.delete_worker(actor_id, worker['ch_name'])
            # The worker is gone; there is nothing left to ttl-check.
            continue
        print("Worker ok.")
        # now check if the worker has been idle beyond the ttl:
        if ttl < 0:
            # ttl < 0 means infinite life. Move on to the next worker rather
            # than returning, so the remaining workers are still checked.
            print("Infinite ttl configured; leaving worker")
            continue
        if worker['status'] == codes.READY and \
                worker['last_execution'] + ttl < time.time():
            # shutdown worker
            print("Shutting down worker beyond ttl.")
            shutdown_worker(worker['ch_name'])
        else:
            print("Worker still has life.")
def shutdown_workers(actor_id):
    """Gracefully shut down all workers for an actor.

    Pass the db_id as the ``actor_id`` argument. If the workers cannot be
    retrieved, the error is logged and nothing is shut down.
    """
    logger.debug("shutdown_workers() called for actor: {}".format(actor_id))
    try:
        workers = Worker.get_workers(actor_id)
    except Exception as e:
        logger.error("Got exception from get_workers: {}".format(e))
        # Without a worker list there is nothing to iterate over; returning
        # here avoids an UnboundLocalError on `workers` below.
        return
    if workers == {}:
        logger.info(
            "shutdown_workers did not receive any workers from Worker.get_worker for actor: {}"
            .format(actor_id))
    # @TODO - this code is not thread safe. we need to update the workers
    # state in a transaction:
    for worker in workers.values():
        shutdown_worker(worker['id'])
def post(self, actor_id):
    """Send a message to an actor, creating a new execution for it.

    The query parameters (except ``message``) plus a few ``_abaco_*`` values
    are collected into a dictionary that travels with the message. An
    execution record is created, the message is put on the actor's message
    channel and, if the actor currently has no workers, a command asking for
    one worker is put on the command channel.

    :param actor_id: the actor id as supplied by the caller.
    :return: ``ok()`` response with the execution id, the message and
        hypermedia links; keys are camel-cased when the ``web``/``case``
        config value is ``'camel'``.
    :raises KeyError: when the actor is not in ``actors_store``.
    """
    def get_hypermedia(actor, exc):
        return {
            '_links': {
                'self': '{}/actors/v2/{}/executions/{}'.format(
                    actor.api_server, actor.id, exc),
                'owner': '{}/profiles/v2/{}'.format(
                    actor.api_server, actor.owner),
                'messages': '{}/actors/v2/{}/messages'.format(
                    actor.api_server, actor.id),
            },
        }

    args = self.validate_post()
    dbid = Actor.get_dbid(g.tenant, actor_id)
    # Resolve the actor before creating any state, so that a request for a
    # missing actor does not leave an execution record and a queued message
    # behind.
    actor = Actor.from_db(actors_store[dbid])

    # build a dictionary of k:v pairs from the query parameters, and pass a
    # single additional object 'message' from within the post payload. Note
    # that 'message' need not be JSON data.
    d = {k: v for k, v in request.args.items() if k != 'message'}
    if hasattr(g, 'user'):
        d['_abaco_username'] = g.user
    if hasattr(g, 'api_server'):
        d['_abaco_api_server'] = g.api_server
    if hasattr(g, 'jwt_header_name'):
        d['_abaco_jwt_header_name'] = g.jwt_header_name

    # create an execution
    exc = Execution.add_execution(dbid, {'cpu': 0,
                                         'io': 0,
                                         'runtime': 0,
                                         'status': SUBMITTED,
                                         'executor': g.user})
    d['_abaco_execution_id'] = exc
    d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '')
    msg_ch = ActorMsgChannel(actor_id=dbid)
    msg_ch.put_msg(message=args['message'], d=d)

    # make sure at least one worker is available
    workers = Worker.get_workers(dbid)
    if len(workers.items()) < 1:
        cmd_ch = CommandChannel()
        cmd_ch.put_cmd(actor_id=dbid,
                       image=actor.image,
                       tenant=g.tenant,
                       num=1,
                       stop_existing=False)

    result = {'execution_id': exc, 'msg': args['message']}
    result.update(get_hypermedia(actor, exc))
    case = Config.get('web', 'case')
    if not case == 'camel':
        return ok(result)
    return ok(dict_to_camel(result))
def get(self, actor_id):
    """List the workers registered for an actor.

    :param actor_id: the actor id as seen by the caller; it is combined with
        `g.tenant` to derive the db id.
    :return: `ok(...)` whose result is a list of the worker descriptions, each
        with its key in the workers collection added under 'id'.
    :raises APIException: with status 404 when the actor is not in
        `actors_store`, or when `Worker.get_workers` raises `WorkerException`.
    """
    dbid = Actor.get_dbid(g.tenant, actor_id)
    try:
        Actor.from_db(actors_store[dbid])
    except KeyError:
        # A missing actor is a "not found" condition, the same as missing workers
        # below; the message previously carried a stray trailing apostrophe.
        raise APIException("actor not found: {}".format(actor_id), 404)
    try:
        workers = Worker.get_workers(dbid)
    except WorkerException as e:
        raise APIException(e.msg, 404)
    result = []
    for worker_id, worker in workers.items():
        worker.update({'id': worker_id})
        result.append(worker)
    return ok(result=result, msg="Workers retrieved successfully.")
def _collect_logs_and_remove(cli, container):
    """Return the container's logs, then try to remove it (removal errors are only printed)."""
    logs = cli.logs(container.get('Id'))
    # remove container, ignore errors
    try:
        cli.remove_container(container=container)
        print("Container removed.")
    except Exception as e:
        print("Exception trying to remove actor: {}".format(e))
    return logs


def execute_actor(actor_id, worker_ch, image, msg, d=None, privileged=False):
    """Run one actor container for a message and collect its stats and logs.

    :param actor_id: passed to `Worker.update_worker_status` to mark the worker BUSY.
    :param worker_ch: the worker's channel; only its `name` is used here.
    :param image: image to create the container from.
    :param msg: the message; stored in the container environment under 'MSG'.
    :param d: environment dictionary for the container. When supplied it is
        modified in place (the 'MSG' key is added). Defaults to a fresh dict.
    :param privileged: when true, the docker socket is bind-mounted into the
        container and the host config is created as privileged.
    :return: a `(result, logs)` tuple; `result` has the keys 'cpu', 'io' and
        'runtime'. If the initial stats stream cannot be opened before the
        stats client's timeout, 'cpu' and 'runtime' are both reported as 1.
    :raises DockerStartContainerError: if starting the container fails.
    """
    # A shared `{}` default would accumulate keys across calls because the
    # dictionary is mutated below.
    if d is None:
        d = {}
    result = {'cpu': 0,
              'io': 0,
              'runtime': 0}
    cli = docker.AutoVersionClient(base_url=dd)
    d['MSG'] = msg
    binds = {}
    volumes = []
    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    if privileged:
        binds = {'/var/run/docker.sock': {
                    'bind': '/var/run/docker.sock',
                    'ro': False}}
        volumes = ['/var/run/docker.sock']
    host_config = cli.create_host_config(binds=binds, privileged=privileged)
    container = cli.create_container(image=image, environment=d, volumes=volumes, host_config=host_config)
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        raise DockerStartContainerError("Could not start container {}. Exception {}".format(container.get('Id'), str(e)))
    start = timeit.default_timer()
    Worker.update_worker_status(actor_id, worker_ch.name, BUSY)
    running = True
    # create a separate cli for checking stats objects since these should be fast and we don't want to wait
    stats_cli = docker.AutoVersionClient(base_url=dd, timeout=1)
    try:
        stats_obj = stats_cli.stats(container=container.get('Id'), decode=True)
    except ReadTimeout:
        # if the container execution is so fast that the initial stats object cannot be created,
        # we skip the running loop and return a minimal stats object. The return
        # shape must still be (result, logs), and the container is cleaned up
        # the same way as on the normal path.
        result['cpu'] = 1
        result['runtime'] = 1
        return result, _collect_logs_and_remove(cli, container)
    while running:
        try:
            print("waiting on a stats obj: {}".format(timeit.default_timer()))
            stats = next(stats_obj)
        except ReadTimeoutError:
            print("next(stats) just timed out: {}".format(timeit.default_timer()))
            # container stopped before another stats record could be read, just ignore and move on
            running = False
            break
        print("got a stats obj: {}".format(timeit.default_timer()))
        # as of docker 1.9, the stats object may be returned as bytes that must be
        # decoded; do that before any key access.
        if isinstance(stats, bytes):
            stats = json.loads(stats.decode("utf-8"))
        # Count cpu exactly once per record, whichever network key is present.
        result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
        try:
            result['io'] += stats['network']['rx_bytes']
        except KeyError:
            # as of docker 1.9 the network key is 'networks' with multiple subkeys.
            # even running docker 1.9, there seems to be a race condition where the
            # 'networks' key doesn't always get populated.
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError:
                pass
        print("Recorded a stats obj: {}".format(timeit.default_timer()))
        try:
            print("waiting on cli.wait: {}".format(timeit.default_timer()))
            cli.wait(container=container.get('Id'), timeout=1)
            print("container finished: {}".format(timeit.default_timer()))
            running = False
        except ReadTimeout:
            print("cli.wait just timed out: {}".format(timeit.default_timer()))
            # the wait timed out so check if we are beyond the max_run_time
            runtime = timeit.default_timer() - start
            if 0 < max_run_time < runtime:
                print("hit runtime limit: {}".format(timeit.default_timer()))
                cli.stop(container.get('Id'))
                running = False
    print("container stopped:{}".format(timeit.default_timer()))
    stop = timeit.default_timer()
    # get logs from container, then remove it
    logs = _collect_logs_and_remove(cli, container)
    result['runtime'] = int(stop - start)
    return result, logs
def subscribe(tenant, actor_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and
    executes actor containers when message arrive. Also subscribes to the worker
    channel for future communications.

    An Agave client is created only when all of `api_server`, `client_id`,
    `client_secret`, `access_token` and `refresh_token` are truthy; it is used
    to refresh the access token before each execution. Access tokens are never
    written to the log output.

    The loop runs while the module-level `keep_running` flag is true and exits
    the process via `sys.exit()` when the actor channel is closed.
    :return:
    """
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        print("Not creating agave client.")
    # the worker channel is serviced on its own thread.
    t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag))
    t.start()
    print("Worker subscribing to actor channel...")
    global keep_running
    while keep_running:
        Worker.update_worker_status(actor_id, worker_ch.name, READY)
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            print("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        print("Received message {}. Starting actor container...".format(str(msg)))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        environment = actor['default_environment']
        print("Actor default environment: {}".format(environment))
        print("Actor privileged: {}".format(privileged))
        # overlay the default_environment registered for the actor with the msg
        # dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                # do not print the token itself: it is a live credential.
                print("Refreshed the tokens. Passed a new access token to the environment.")
            except Exception as e:
                print("Got an exception trying to get an access token: {}".format(e))
        else:
            print("Agave client `ag` is None -- not passing access token.")
        # log a copy of the environment with the access token masked.
        printable_environment = dict(environment)
        if printable_environment.get('_abaco_access_token'):
            printable_environment['_abaco_access_token'] = '***'
        print("Passing update environment: {}".format(printable_environment))
        try:
            # NOTE(review): `image` is neither a parameter nor a local of this
            # function; presumably it is a module-level global -- confirm.
            stats, logs = execute_actor(actor_id, worker_ch, image, message,
                                        environment, privileged)
        except DockerStartContainerError as e:
            print("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # add the execution to the actor store
        print("Actor container finished successfully. Got stats object:{}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats)
        print("Added execution: {}".format(execution_id))
        Execution.set_logs(execution_id, logs)
        Worker.update_worker_execution_time(actor_id, worker_ch.name)
def shutdown_workers(actor_id):
    """Gracefully shut down every worker of an actor.

    `actor_id` must be the actor's db_id. Each worker returned by
    `Worker.get_workers` is shut down via `shutdown_worker`, addressed by its
    'ch_name' entry.
    """
    for _key, worker_info in Worker.get_workers(actor_id).items():
        channel_name = worker_info['ch_name']
        shutdown_worker(channel_name)
def process(self, cmd):
    """Handle one command: start workers for an actor and send each its client details.

    :param cmd: mapping with the keys 'actor_id', 'image' and 'tenant', and the
        optional keys 'stop_existing' (default True) and 'num' (default
        `self.num_workers`).

    New workers are started with `self.start_workers`; if that raises
    `SpawnerException` the command is abandoned. When `stop_existing` is true
    the actor's current workers are stopped and the store entry is replaced by
    the new workers; otherwise the new workers are added alongside the
    existing ones. Each anonymous channel then receives a message telling the
    worker whether a client was generated for it. Tokens and secrets are not
    written to the log output.
    """
    print("Processing cmd:{}".format(str(cmd)))
    actor_id = cmd['actor_id']
    image = cmd['image']
    tenant = cmd['tenant']
    stop_existing = cmd.get('stop_existing', True)
    num_workers = cmd.get('num', self.num_workers)
    print("Actor id:{}".format(actor_id))
    try:
        _, anon_channels, new_workers = self.start_workers(actor_id, image, tenant, num_workers)
    except SpawnerException:
        # for now, start_workers will do clean up for a SpawnerException, so we just need
        # to return back to the run loop.
        return
    print("Created new workers: {}".format(str(new_workers)))

    # stop any existing workers, and add the new workers to the store first so that
    # the records will be there when the workers go to update their status
    if stop_existing:
        self.stop_workers(actor_id)
        workers_store[actor_id] = new_workers
    else:
        for _, worker in new_workers.items():
            Worker.add_worker(actor_id, worker)

    # new_workers is a dictionary of dictionaries; list(d) creates a list of keys
    # for a dictionary d. hence, the idx^th entry of worker_keys is the key of the
    # worker paired with the idx^th anonymous channel.
    worker_keys = list(new_workers)

    # send new workers their clients and tell them to subscribe to the actor channel.
    for idx, channel in enumerate(anon_channels):
        print("Getting client for worker {}".format(idx))
        client_ch = ClientsChannel()
        client_msg = client_ch.request_client(tenant=tenant,
                                              actor_id=actor_id,
                                              worker_id=new_workers[worker_keys[idx]]['ch_name'],
                                              secret=self.secret)
        # we need to ignore errors when generating clients because it's possible it is not set up for a specific
        # tenant. we log it instead.
        if client_msg.get('status') == 'error':
            print("Error generating client: {}".format(client_msg.get('message')))
            channel.put({'status': 'ok',
                         'actor_id': actor_id,
                         'tenant': tenant,
                         'client': 'no'})
        # else, client was generated successfully:
        else:
            # only the client id is logged; the tokens are live credentials.
            print("Got a client: {}".format(client_msg['client_id']))
            channel.put({'status': 'ok',
                         'actor_id': actor_id,
                         'tenant': tenant,
                         'client': 'yes',
                         'client_id': client_msg['client_id'],
                         'client_secret': client_msg['client_secret'],
                         'access_token': client_msg['access_token'],
                         'refresh_token': client_msg['refresh_token'],
                         'api_server': client_msg['api_server'],
                         })
    print("Done processing command.")