def test_crud_execution(self): """It test basic CRUD operations of an Execution class""" # We verify that the object is not in the db after creating it execution = Execution() execution.execution_type = "execution_type" execution.status = "status" self.assertIsNone(execution.id) # We store the object in the db db.session.add(execution) # We recover the execution from the db execution = db.session.query(Execution).filter_by( execution_type="execution_type").first() self.assertIsNotNone(execution.id) self.assertEquals("execution_type", execution.execution_type) self.assertEquals("status", execution.status) # We check that we can update the execution execution.execution_type = "X" db.session.commit() execution_2 = db.session.query(Execution).filter_by( execution_type="X").first() self.assertEquals(execution.id, execution_2.id) self.assertEquals("X", execution.execution_type) # We check the delation db.session.delete(execution_2) count = db.session.query(Execution).filter_by( execution_type="X").count() self.assertEquals(0, count)
def subscribe(actor_id, worker_ch): """ Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor containers when message arrive. Also subscribes to the worker channel for future communications. :return: """ actor_ch = ActorMsgChannel(actor_id) t = threading.Thread(target=process_worker_ch, args=(worker_ch, actor_id, actor_ch)) t.start() print("Worker subscribing to actor channel...") while keep_running: update_worker_status(actor_id, worker_ch.name, READY) try: msg = actor_ch.get(timeout=2) except channelpy.ChannelTimeoutException: continue print("Received message {}. Starting actor container...".format(str(msg))) message = msg.pop("msg", "") try: stats, logs = execute_actor(actor_id, worker_ch, image, message, msg) except DockerStartContainerError as e: print("Got DockerStartContainerError: {}".format(str(e))) Actor.set_status(actor_id, ERROR) continue # add the execution to the actor store print("Actor container finished successfully. Got stats object:{}".format(str(stats))) exc_id = Execution.add_execution(actor_id, stats) Execution.set_logs(exc_id, logs)
def setUp(self): """ It creates the memory db """ db.create_all() # We store some Applications in the db for the tests application_1 = Application() application_1.name = 'AppName_1' application_2 = Application() application_2.name = 'AppName_2' # Adding executing scripts execution_script_1 = ExecutionConfiguration() execution_script_1.execution_type = "slurm:sbatch" execution_script_2 = ExecutionConfiguration() execution_script_2.execution_type = "slurm:sbatch2" application_2.execution_configurations = [ execution_script_1, execution_script_2 ] db.session.add(application_1) db.session.add(application_2) # We store some testbeds in the db for the tests testbed_1 = Testbed("name_1", True, "slurm", "ssh", "user@server", ['slurm']) testbed_2 = Testbed("name_2", False, "slurm", "ssh", "user@server", ['slurm']) testbed_3 = Testbed("name_3", True, "slurm", "ssh", "user@server", ['slurm', 'slurm:singularity']) db.session.add(testbed_1) db.session.add(testbed_2) db.session.add(testbed_3) db.session.commit() deployment = Deployment() deployment.executable_id = execution_script_1.id deployment.testbed_id = testbed_1.id db.session.add(deployment) # We store some nodes in the db for the tests node_1 = Node() node_1.name = "node_1" node_1.information_retrieved = True node_2 = Node() node_2.name = "node_2" node_2.information_retrieved = False db.session.add(node_1) db.session.add(node_2) execution = Execution() execution.execution_type = "execution_type" execution.status = "status" db.session.add(execution) db.session.commit()
def test_initialization_execution(self): """Test the initializacion method of the class Execution""" execution = Execution() execution.execution_type = "execution_type" execution.status = "status" self.assertEquals("execution_type", execution.execution_type) self.assertEquals("status", execution.status)
def post(self, actor_id): try: actor = Actor.from_db(actors_store[actor_id]) except KeyError: raise APIException( "actor not found: {}'".format(actor_id), 404) args = self.validate_post() Execution.add_execution(actor_id, args) return ok(result=actor, msg="Actor execution added successfully.")
def add_resource(execution): """ it adds resources to a running execution adapt_compss_resources <master_node> <master_job_id> CREATE SLURM-Cluster default <singularity_image> """ if (( execution.execution_type == execute_type_singularity_pm)) : logging.info("Executing type corresponds with SINGULARITY_PM, trying adaptation") if (( execution.status == Execution.__status_running__)) : url = execution.execution_configuration.testbed.endpoint scaling_upper_bound = execution.execution_configuration.application.scaling_upper_bound enqueue_env_file = execution.execution_configuration.testbed.extra_config['enqueue_env_file'] singularity_image_file = execution.execution_configuration.executable.singularity_image_file sbatch_id = execution.batch_id upper_bound_ok = True if ( scaling_upper_bound is not None ) and ( scaling_upper_bound != 0 ) : if scaling_upper_bound <= execution.get_number_extra_jobs() : upper_bound_ok = False if upper_bound_ok : node = find_first_node(sbatch_id, url) command = "source" params = [] params.append(enqueue_env_file) params.append(";") params.append("adapt_compss_resources") params.append(node) params.append(sbatch_id) params.append('CREATE SLURM-Cluster default') params.append(singularity_image_file) output = shell.execute_command(command, url, params) job_name = parse_add_resource_output(output) print(job_name) time.sleep(2) extra_job_id = get_job_id_after_adaptation(job_name, url) print(extra_job_id) if extra_job_id != '' or extra_job_id is not None : child = Execution() child.status = Execution.__status_running__ child.execution_type = execute_type_singularity_pm child.batch_id = extra_job_id execution.children.append(child) db.session.commit() time.sleep(5) __add_nodes_to_execution__(child, url) else : logging.info('Execution already reached its maximum number of extra jobs, no adaptation possible') else : logging.info("Execution is not in RUNNING status, no action can be done") else : logging.info("Execution: " + execution.execution_type + " it is not compatible with add resource action")
def post(self, actor_id): id = Actor.get_dbid(g.tenant, actor_id) try: actor = Actor.from_db(actors_store[id]) except KeyError: raise ResourceError("actor not found: {}'".format(actor_id), 404) args = self.validate_post() Execution.add_execution(id, args) return ok(result=actor.display(), msg="Actor execution added successfully.")
def test_child_parent_relationship(self): """ It tests the child parent relationship between several executions """ parent = Execution() parent.status = "x1" db.session.add(parent) db.session.commit() # Empty list of children parent = db.session.query(Execution).filter_by(status="x1").first() self.assertEquals(0, len(parent.children)) # We add childer child_1 = Execution() child_1.status = "x2" parent.children.append(child_1) child_2 = Execution() child_2.status = "x3" parent.children.append(child_2) db.session.commit() parent = db.session.query(Execution).filter_by(status="x1").first() self.assertEquals(2, len(parent.children)) self.assertEquals(child_1, parent.children[0]) self.assertEquals(child_2, parent.children[1]) child_1 = db.session.query(Execution).filter_by(status="x2").first() self.assertEquals(parent, child_1.parent) child_2 = db.session.query(Execution).filter_by(status="x3").first() self.assertEquals(parent, child_2.parent)
def save_execution_plan(feature_id): feature = Feature.objects.get(pk=feature_id) if feature.executionLock: return "execution already started" else: feature.lock_feature() execution = Execution() workspace = WorkSpace.objects.get(pk=feature.workspace) execution.fill(workspace, "planed", "hardcode-executor") execution.save() return "ok"
def createTables(): try: Execution.create_table() print("Tabela 'Execution' criada com sucesso!") except peewee.OperationalError: print("Tabela 'Execution' ja existe!") try: ExecutionItem.create_table() print("Tabela 'ExecutionItem' criada com sucesso!") except peewee.OperationalError: print("Tabela 'ExecutionItem' ja existe!")
def post(self, actor_id): logger.debug("top of POST /actors/{}/executions".format(actor_id)) id = Actor.get_dbid(g.tenant, actor_id) try: actor = Actor.from_db(actors_store[id]) except KeyError: logger.debug("did not find actor: {}.".format(actor_id)) raise ResourceError( "No actor found with id: {}.".format(actor_id), 404) args = self.validate_post() logger.debug("execution post args validated: {}.".format(actor_id)) Execution.add_execution(id, args) logger.info("execution added: {}.".format(actor_id)) return ok(result=actor.display(), msg="Actor execution added successfully.")
def test_many_to_many_relations_with_nodes(self): """ It tests the many to many relations with Nodes """ node_1 = Node() node_1.name = "node1" node_1.information_retrieved = False node_2 = Node() node_2.name = "node2" node_2.information_retrieved = False db.session.add(node_1) db.session.add(node_2) execution_1 = Execution() execution_1.status = "x1" execution_2 = Execution() execution_2.status = "x2" db.session.add(execution_1) db.session.add(execution_2) db.session.commit() execution_1.nodes = [node_1, node_2] execution_2.nodes = [node_2, node_1] db.session.commit() execution = db.session.query(Execution).filter_by(status="x1").first() self.assertEquals(node_1, execution.nodes[0]) self.assertEquals(node_2, execution.nodes[1]) execution = db.session.query(Execution).filter_by(status="x2").first() self.assertEquals(node_2, execution.nodes[0]) self.assertEquals(node_1, execution.nodes[1])
def get(self, actor_id, execution_id): def get_hypermedia(actor, exc): return {'_links': {'self': '{}/actors/v2/{}/executions/{}/logs'.format(actor.api_server, actor.id, exc.id), 'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner), 'execution': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc.id)}, } dbid = Actor.get_dbid(g.tenant, actor_id) try: actor = Actor.from_db(actors_store[dbid]) except KeyError: raise APIException( "actor not found: {}'".format(actor_id), 404) try: excs = executions_store[dbid] except KeyError: raise APIException("No executions found for actor {}.".format(actor_id)) try: exc = Execution.from_db(excs[execution_id]) except KeyError: raise APIException("Execution not found {}.".format(execution_id)) try: logs = logs_store[execution_id] except KeyError: logs = "" result={'logs': logs} result.update(get_hypermedia(actor, exc)) return ok(result, msg="Logs retrieved successfully.")
def setUp(self): """ It creates the model objects and saves then in the database """ super(RankingTests, self).setUp() self.execution = Execution() self.execution.slurm_sbatch_id = 2333 execution_configuration = ExecutionConfiguration() execution_configuration.id = 22 self.execution.execution_configuration = execution_configuration application = Application() application.name = "Matmul" execution_configuration.application = application testbed = Testbed("nova", True, "SLURM", "SSH", "*****@*****.**", ["SINGULARITY"]) execution_configuration.testbed = testbed db.session.add(testbed) db.session.add(application) db.session.add(execution_configuration) db.session.add(self.execution) db.session.commit()
def get(self, actor_id, execution_id): def get_hypermedia(actor, exc): return {'_links': {'self': '{}/actors/v2/{}/executions/{}/logs'.format(actor.api_server, actor.id, exc.id), 'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner), 'execution': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc.id)}, } logger.debug("top of GET /actors/{}/executions/{}/logs.".format(actor_id, execution_id)) dbid = Actor.get_dbid(g.tenant, actor_id) try: actor = Actor.from_db(actors_store[dbid]) except KeyError: logger.debug("did not find actor: {}.".format(actor_id)) raise ResourceError( "No actor found with id: {}.".format(actor_id), 404) try: excs = executions_store[dbid] except KeyError: logger.debug("did not find executions. actor: {}.".format(actor_id)) raise ResourceError("No executions found for actor {}.".format(actor_id)) try: exc = Execution.from_db(excs[execution_id]) except KeyError: logger.debug("did not find execution: {}. actor: {}.".format(execution_id, actor_id)) raise ResourceError("Execution {} not found.".format(execution_id)) try: logs = logs_store[execution_id] except KeyError: logger.debug("did not find logs. execution: {}. actor: {}.".format(execution_id, actor_id)) logs = "" result={'logs': logs} result.update(get_hypermedia(actor, exc)) return ok(result, msg="Logs retrieved successfully.")
def execute_application_type_slurm_sbatch(execution, identifier): """ Executes an application with a device supervisor configured for slurm sbatch """ execution_configuration, testbed, deployment, executable = __get_srun_info__( execution, identifier) if testbed.category != Testbed.slurm_category: # If the category is not SLURM we can not execute the app execution.status = execute_status_failed execution.output = "Testbed does not support " + execute_type_slurm_sbatch + " applications" db.session.commit() elif not testbed.on_line: # If the testbed is off-line is not SLURM we can not execute the app execution.status = execute_status_failed execution.output = "Testbed is off-line" db.session.commit() else: # Preparing the command to be executed command = "sbatch" endpoint = testbed.endpoint params = [] params.append(executable.executable_file) logging.info("Launching execution of application: command: " + command + " | endpoint: " + endpoint + " | params: " + str(params)) output = shell.execute_command(command, endpoint, params) print(output) sbatch_id = __extract_id_from_sbatch__(output) execution = Execution() execution.execution_type = execution_configuration.execution_type execution.status = Execution.__status_running__ execution_configuration.executions.append(execution) execution.slurm_sbatch_id = sbatch_id db.session.commit() # Add nodes __add_nodes_to_execution__(execution, endpoint)
def post(self, actor_id): def get_hypermedia(actor, exc): return { '_links': { 'self': '{}/actors/v2/{}/executions/{}'.format( actor.api_server, actor.id, exc), 'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner), 'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id) }, } args = self.validate_post() d = {} # build a dictionary of k:v pairs from the query parameters, and pass a single # additional object 'message' from within the post payload. Note that 'message' # need not be JSON data. for k, v in request.args.items(): if k == 'message': continue d[k] = v if hasattr(g, 'user'): d['_abaco_username'] = g.user if hasattr(g, 'api_server'): d['_abaco_api_server'] = g.api_server # if hasattr(g, 'jwt'): # d['_abaco_jwt'] = g.jwt # if hasattr(g, 'jwt_server'): # d['_abaco_jwt_server'] = g.jwt_server if hasattr(g, 'jwt_header_name'): d['_abaco_jwt_header_name'] = g.jwt_header_name dbid = Actor.get_dbid(g.tenant, actor_id) # create an execution exc = Execution.add_execution( dbid, { 'cpu': 0, 'io': 0, 'runtime': 0, 'status': SUBMITTED, 'executor': g.user }) d['_abaco_execution_id'] = exc d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '') ch = ActorMsgChannel(actor_id=dbid) ch.put_msg(message=args['message'], d=d) # make sure at least one worker is available actor = Actor.from_db(actors_store[dbid]) actor.ensure_one_worker() result = {'execution_id': exc, 'msg': args['message']} result.update(get_hypermedia(actor, exc)) case = Config.get('web', 'case') if not case == 'camel': return ok(result) else: return ok(dict_to_camel(result))
def inserTablesData(): execution_1 = Execution.create(input='JAN21-A', outputs='none') execution_2 = Execution.create(input='JAN21-B', outputs='noneA') execution_3 = Execution.create(input='JAN23-C', outputs='nonec') executionItem_1 = { 'status': 'STARTED', 'params': 'nada', 'result': '', 'error': '', 'execution_id': execution_1 } executionItem_2 = { 'status': 'SUCCESS', 'params': 'nada', 'result': '', 'error': '', 'execution_id': execution_1 } executionItem_3 = { 'status': 'STARTED', 'params': 'nada', 'result': '', 'error': '', 'execution_id': execution_2 } executionItem_4 = { 'status': 'STARTED', 'params': 'nada', 'result': '', 'error': '', 'execution_id': execution_3 } arrayOfExecutionItens = [ executionItem_1, executionItem_2, executionItem_3, executionItem_4 ] ExecutionItem.insert_many(arrayOfExecutionItens).execute()
def __parse_output__(output, endpoint, execution_configuration, child_execution=None): """ It parses output and adds nodes to the execution """ sbatch_id = __extract_id_from_squeue__(output) execution = None if child_execution : execution = child_execution else : execution = Execution() execution.execution_type = execution_configuration.execution_type execution_configuration.executions.append(execution) execution.status = Execution.__status_running__ execution.batch_id = sbatch_id db.session.commit() # Add nodes __add_nodes_to_execution__(execution, endpoint)
def stop_execution(execution): """ It stops a checkpointable execution """ if Application.CHECKPOINTABLE == execution.execution_configuration.application.application_type: child = None if execution.status == Execution.__status_running__: child = Execution() child.status = Execution.__status_running__ child.execution_configuration = execution.execution_configuration child.execution_type = execution.execution_configuration.execution_type child.slurm_sbatch_id = execution.slurm_sbatch_id execution.slurm_sbatch_id = -1 execution.children.append(child) else: child = next( filter( lambda child: child.status == Execution.__status_running__, execution.children)) # Only one execution can be running execution.status = Execution.__status_stopped__ db.session.commit() cancel_execution(child, execution.execution_configuration.testbed.endpoint) else: slurm.stop_execution( execution.slurm_sbatch_id, execution.execution_configuration.testbed.endpoint)
def post(self, actor_id): def get_hypermedia(actor, exc): return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc), 'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner), 'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},} args = self.validate_post() d = {} # build a dictionary of k:v pairs from the query parameters, and pass a single # additional object 'message' from within the post payload. Note that 'message' # need not be JSON data. for k, v in request.args.items(): if k == 'message': continue d[k] = v if hasattr(g, 'user'): d['_abaco_username'] = g.user if hasattr(g, 'api_server'): d['_abaco_api_server'] = g.api_server # if hasattr(g, 'jwt'): # d['_abaco_jwt'] = g.jwt # if hasattr(g, 'jwt_server'): # d['_abaco_jwt_server'] = g.jwt_server if hasattr(g, 'jwt_header_name'): d['_abaco_jwt_header_name'] = g.jwt_header_name dbid = Actor.get_dbid(g.tenant, actor_id) # create an execution exc = Execution.add_execution(dbid, {'cpu': 0, 'io': 0, 'runtime': 0, 'status': SUBMITTED, 'executor': g.user}) d['_abaco_execution_id'] = exc d['_abaco_Content-Type'] = args.get('_abaco_Content-Type', '') ch = ActorMsgChannel(actor_id=dbid) ch.put_msg(message=args['message'], d=d) # make sure at least one worker is available workers = Worker.get_workers(dbid) actor = Actor.from_db(actors_store[dbid]) if len(workers.items()) < 1: ch = CommandChannel() ch.put_cmd(actor_id=dbid, image=actor.image, tenant=g.tenant, num=1, stop_existing=False) result={'execution_id': exc, 'msg': args['message']} result.update(get_hypermedia(actor, exc)) case = Config.get('web', 'case') if not case == 'camel': return ok(result) else: return ok(dict_to_camel(result))
def get(self, actor_id, execution_id): dbid = Actor.get_dbid(g.tenant, actor_id) try: actors_store[dbid] except KeyError: raise APIException( "actor not found: {}'".format(actor_id), 404) try: excs = executions_store[dbid] except KeyError: raise APIException("No executions found for actor {}.".format(actor_id)) try: exc = Execution.from_db(excs[execution_id]) except KeyError: raise APIException("Execution not found {}.".format(execution_id)) return ok(result=exc.display(), msg="Actor execution retrieved successfully.")
def get(self, actor_id, execution_id): dbid = Actor.get_dbid(g.tenant, actor_id) try: actors_store[dbid] except KeyError: raise ResourceError("actor not found: {}'".format(actor_id), 404) try: excs = executions_store[dbid] except KeyError: raise ResourceError( "No executions found for actor {}.".format(actor_id)) try: exc = Execution.from_db(excs[execution_id]) except KeyError: raise ResourceError("Execution not found {}.".format(execution_id)) return ok(result=exc.display(), msg="Actor execution retrieved successfully.")
def execute_application(execution_configuration, create_profile=False, use_stored_profile=False): """ This function executes an application in the selected testbed, using the execution script configuration. """ # We create the execution execution = Execution() execution.execution_type = execution_configuration.execution_type execution.status = execute_status_submitted profile_folder = app.config['APP_PROFILE_FOLDER'] db.session.add(execution) db.session.commit() # We verify that we recoginze the type of execution if execution.execution_type == execute_type_slurm_sbatch: t = Thread(target=execute_application_type_slurm_sbatch, args=(execution, execution_configuration.id)) t.start() return t elif execution.execution_type == execute_type_singularity_pm: t = Thread(target=execute_application_type_singularity_pm, args=(execution, execution_configuration.id, create_profile, use_stored_profile, profile_folder)) t.start() return t elif execution.execution_type == execute_type_singularity_srun: t = Thread(target=execute_application_type_singularity_srun, args=(execution, execution_configuration.id)) t.start() return t elif execution.execution_type == execute_type_slurm_srun: t = Thread(target=execute_application_type_slurm_srun, args=(execution, execution_configuration.id)) t.start() return t elif execution.execution_type == Executable.__type_pm__: t = Thread(target=execute_application_type_pm, args=(execution, execution_configuration.id, create_profile, use_stored_profile, profile_folder)) t.start() return t else: execution.status = execute_status_failed execution.output = "No support for execurtion type: " + execution.execution_type db.session.commit()
def get(self, actor_id, execution_id): logger.debug("top of GET /actors/{}/executions/{}.".format(actor_id, execution_id)) dbid = Actor.get_dbid(g.tenant, actor_id) try: actors_store[dbid] except KeyError: logger.debug("did not find actor: {}.".format(actor_id)) raise ResourceError( "No actor found with id: {}.".format(actor_id), 404) try: excs = executions_store[dbid] except KeyError: logger.debug("did not find executions: {}.".format(actor_id)) raise ResourceError("No executions found for actor {}.".format(actor_id)) try: exc = Execution.from_db(excs[execution_id]) except KeyError: logger.debug("did not find execution: {}. actor: {}.".format(execution_id, actor_id)) raise ResourceError("Execution not found {}.".format(execution_id)) return ok(result=exc.display(), msg="Actor execution retrieved successfully.")
def process_link(link, msg, d): """ Process an event with a link. :return: """ # ensure that the linked actor still exists; the link attribute is *always* the dbid of the linked # actor logger.debug("top of process_link") try: actors_store[link] except KeyError as e: logger.error( "Processing event message for actor {} that does not exist. Quiting" .format(link)) raise e # create an execution for the linked actor with message exc = Execution.add_execution( link, { 'cpu': 0, 'io': 0, 'runtime': 0, 'status': SUBMITTED, 'executor': 'Abaco Event' }) logger.info( "Events processor agent added execution {} for actor {}".format( exc, link)) d['_abaco_execution_id'] = exc logger.debug( "sending message to actor. Final message {} and message dictionary: {}" .format(msg, d)) ch = ActorMsgChannel(actor_id=link) ch.put_msg(message=msg, d=d) ch.close() logger.info("link processed.")
def restart_execution(execution): """ It stops a checkpointable execution """ # We create the execution child = Execution() child.execution_type = execution.execution_configuration.execution_type child.status = Execution.__status_submitted__ execution.children.append(child) execution.status = Execution.__status_restarted__ db.session.commit() if execution.execution_configuration.execution_type == execute_type_slurm_srun : execute_application_type_slurm_srun(child, execution.execution_configuration_id, True) child.status = Execution.__status_running__ db.session.commit() else : child.status = Execution.__status_failed__ db.session.commit()
def subscribe(tenant, actor_id, image, worker_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch): """ Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor containers when message arrive. Also launches a separate thread which ultimately subscribes to the worker channel for future communications. :return: """ logger.debug("Top of subscribe(). worker_id: {}".format(worker_id)) actor_ch = ActorMsgChannel(actor_id) # establish configs for this worker ------- try: leave_containers = Config.get('workers', 'leave_containers') except configparser.NoOptionError: logger.debug("No leave_containers value configured.") leave_containers = False if hasattr(leave_containers, 'lower'): leave_containers = leave_containers.lower() == "true" logger.debug("leave_containers: {}".format(leave_containers)) try: mem_limit = Config.get('workers', 'mem_limit') except configparser.NoOptionError: logger.debug("No mem_limit value configured.") mem_limit = "-1" mem_limit = str(mem_limit) try: max_cpus = Config.get('workers', 'max_cpus') except configparser.NoOptionError: logger.debug("No max_cpus value configured.") max_cpus = "-1" logger.debug("max_cpus: {}".format(max_cpus)) # instantiate an OAuth client python object if credentials were passed ----- ag = None if api_server and client_id and client_secret and access_token and refresh_token: logger.info("Creating agave client.") verify = get_tenant_verify(tenant) ag = Agave(api_server=api_server, token=access_token, refresh_token=refresh_token, api_key=client_id, api_secret=client_secret, verify=verify) else: logger.info("Not creating agave client.") # start a separate thread for handling messages sent to the worker channel ---- logger.info("Starting the process worker channel thread.") t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag)) t.start() # subscribe to the actor message queue ----- logger.info( "Worker subscribing to actor channel. worker_id: {}".format(worker_id)) # keep track of whether we need to update the worker's status back to READY; otherwise, we # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s) update_worker_status = True # global tracks whether this worker should keep running. globals.keep_running = True # consecutive_errors tracks the number of consecutive times a worker has gotten an error trying to process a # message. Even though the message will be requeued, we do not want the worker to continue processing # indefinitely when a compute node is unhealthy. consecutive_errors = 0 # main subscription loop -- processing messages from actor's mailbox while globals.keep_running: logger.debug("top of keep_running; worker id: {}".format(worker_id)) if update_worker_status: Worker.update_worker_status(actor_id, worker_id, READY) logger.debug( "updated worker status to READY in SUBSCRIBE; worker id: {}". format(worker_id)) update_worker_status = False try: msg, msg_obj = actor_ch.get_one() except channelpy.ChannelClosedException: logger.info("Channel closed, worker exiting. worker id: {}".format( worker_id)) globals.keep_running = False sys.exit() logger.info("worker {} processing new msg.".format(worker_id)) try: Worker.update_worker_status(actor_id, worker_id, BUSY) except Exception as e: logger.error( "unexpected exception from call to update_worker_status. Nacking message." "actor_id: {}; worker_id: {}; status: {}; exception: {}". 
format(actor_id, worker_id, BUSY, e)) logger.info("worker exiting. worker_id: {}".format(worker_id)) msg_obj.nack(requeue=True) raise e update_worker_status = True logger.info( "Received message {}. Starting actor container. worker id: {}". format(msg, worker_id)) # the msg object is a dictionary with an entry called message and an arbitrary # set of k:v pairs coming in from the query parameters. message = msg.pop('message', '') try: actor = Actor.from_db(actors_store[actor_id]) execution_id = msg['_abaco_execution_id'] content_type = msg['_abaco_Content_Type'] mounts = actor.mounts logger.debug("actor mounts: {}".format(mounts)) except Exception as e: logger.error( "unexpected exception retrieving actor, execution, content-type, mounts. Nacking message." "actor_id: {}; worker_id: {}; status: {}; exception: {}". format(actor_id, worker_id, BUSY, e)) msg_obj.nack(requeue=True) logger.info("worker exiting. worker_id: {}".format(worker_id)) raise e # for results, create a socket in the configured directory. try: socket_host_path_dir = Config.get('workers', 'socket_host_path_dir') except (configparser.NoSectionError, configparser.NoOptionError) as e: logger.error( "No socket_host_path configured. Cannot manage results data. Nacking message" ) Actor.set_status( actor_id, ERROR, status_message="Abaco instance not configured for results data." ) msg_obj.nack(requeue=True) logger.info("worker exiting. worker_id: {}".format(worker_id)) raise e socket_host_path = '{}.sock'.format( os.path.join(socket_host_path_dir, worker_id, execution_id)) logger.info("Create socket at path: {}".format(socket_host_path)) # add the socket as a mount: mounts.append({ 'host_path': socket_host_path, 'container_path': '/_abaco_results.sock', 'format': 'ro' }) # for binary data, create a fifo in the configured directory. The configured # fifo_host_path_dir is equal to the fifo path in the worker container: fifo_host_path = None if content_type == 'application/octet-stream': try: fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir') except (configparser.NoSectionError, configparser.NoOptionError) as e: logger.error( "No fifo_host_path configured. Cannot manage binary data.") Actor.set_status( actor_id, ERROR, status_message= "Abaco instance not configured for binary data. Nacking message." ) msg_obj.nack(requeue=True) logger.info("worker exiting. worker_id: {}".format(worker_id)) raise e fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id) try: os.mkfifo(fifo_host_path) logger.info("Created fifo at path: {}".format(fifo_host_path)) except Exception as e: logger.error( "Could not create fifo_path. Nacking message. Exception: {}" .format(e)) msg_obj.nack(requeue=True) logger.info("worker exiting. worker_id: {}".format(worker_id)) raise e # add the fifo as a mount: mounts.append({ 'host_path': fifo_host_path, 'container_path': '/_abaco_binary_data', 'format': 'ro' }) # the execution object was created by the controller, but we need to add the worker id to it now that we # know which worker will be working on the execution. logger.debug( "Adding worker_id to execution. woker_id: {}".format(worker_id)) try: Execution.add_worker_id(actor_id, execution_id, worker_id) except Exception as e: logger.error( "Unexpected exception adding working_id to the Execution. Nacking message. Exception: {}" .format(e)) msg_obj.nack(requeue=True) logger.info("worker exiting. 
worker_id: {}".format(worker_id)) raise e # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted. privileged = False if type(actor['privileged']) == bool and actor['privileged']: privileged = True logger.debug("privileged: {}; worker_id: {}".format( privileged, worker_id)) # overlay resource limits if set on actor: if actor.mem_limit: mem_limit = actor.mem_limit if actor.max_cpus: max_cpus = actor.max_cpus # retrieve the default environment registered with the actor. environment = actor['default_environment'] logger.debug("Actor default environment: {}".format(environment)) # construct the user field from the actor's uid and gid: user = get_container_user(actor) logger.debug("Final user valiue: {}".format(user)) # overlay the default_environment registered for the actor with the msg # dictionary environment.update(msg) environment['_abaco_access_token'] = '' environment['_abaco_actor_dbid'] = actor_id environment['_abaco_actor_id'] = actor.id environment['_abaco_worker_id'] = worker_id environment['_abaco_container_repo'] = actor.image environment['_abaco_actor_state'] = actor.state environment['_abaco_actor_name'] = actor.name or 'None' logger.debug("Overlayed environment: {}; worker_id: {}".format( environment, worker_id)) # if we have an agave client, get a fresh set of tokens: if ag: try: ag.token.refresh() token = ag.token.token_info['access_token'] environment['_abaco_access_token'] = token logger.info( "Refreshed the tokens. Passed {} to the environment.". format(token)) except Exception as e: logger.error( "Got an exception trying to get an access token. Stoping worker and nacking message. " "Exception: {}".format(e)) msg_obj.nack(requeue=True) logger.info("worker exiting. worker_id: {}".format(worker_id)) raise e else: logger.info( "Agave client `ag` is None -- not passing access token; worker_id: {}" .format(worker_id)) logger.info("Passing update environment: {}".format(environment)) logger.info("About to execute actor; worker_id: {}".format(worker_id)) try: stats, logs, final_state, exit_code, start_time = execute_actor( actor_id, worker_id, execution_id, image, message, user, environment, privileged, mounts, leave_containers, fifo_host_path, socket_host_path, mem_limit, max_cpus) except DockerStartContainerError as e: logger.error( "Worker {} got DockerStartContainerError: {} trying to start actor for execution {}." "Placing message back on queue.".format( worker_id, e, execution_id)) # if we failed to start the actor container, we leave the worker up and re-queue the original message msg_obj.nack(requeue=True) logger.debug('message requeued.') consecutive_errors += 1 if consecutive_errors > MAX_WORKER_CONSECUTIVE_ERRORS: logger.error( "Worker {} failed to successfully start actor for execution {} {} consecutive times; " "Exception: {}. Putting the actor in error status and shutting " "down workers.".format(worker_id, execution_id, MAX_WORKER_CONSECUTIVE_ERRORS, e)) Actor.set_status(actor_id, ERROR, "Error executing container: {}; w".format(e)) shutdown_workers(actor_id, delete_actor_ch=False) # wait for worker to be shutdown.. time.sleep(60) break else: # sleep five seconds before getting a message again to give time for the compute # node and/or docker health to recover time.sleep(5) continue except DockerStopContainerError as e: logger.error( "Worker {} was not able to stop actor for execution: {}; Exception: {}. " "Putting the actor in error status and shutting down workers.". 
format(worker_id, execution_id, e)) Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e)) # since the error was with stopping the actor, we will consider this message "processed"; this choice # could be reconsidered/changed msg_obj.ack() shutdown_workers(actor_id, delete_actor_ch=False) # wait for worker to be shutdown.. time.sleep(60) break except Exception as e: logger.error( "Worker {} got an unexpected exception trying to run actor for execution: {}." "Putting the actor in error status and shutting down workers. " "Exception: {}; type: {}".format(worker_id, execution_id, e, type(e))) Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e)) # the execute_actor function raises a DockerStartContainerError if it met an exception before starting the # actor container; if the container was started, then another exception should be raised. Therefore, # we can assume here that the container was at least started and we can ack the message. msg_obj.ack() shutdown_workers(actor_id, delete_actor_ch=False) # wait for worker to be shutdown.. time.sleep(60) break # ack the message msg_obj.ack() logger.debug( "container finished successfully; worker_id: {}".format(worker_id)) # Add the completed stats to the execution logger.info( "Actor container finished successfully. Got stats object:{}". format(str(stats))) Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time) logger.info("Added execution: {}; worker_id: {}".format( execution_id, worker_id)) # Add the logs to the execution try: Execution.set_logs(execution_id, logs) logger.debug("Successfully added execution logs.") except Exception as e: msg = "Got exception trying to set logs for exception {}; " \ "Exception: {}; worker_id: {}".format(execution_id, e, worker_id) logger.error(msg) # Update the worker's last updated and last execution fields: try: Worker.update_worker_execution_time(actor_id, worker_id) logger.debug("worker execution time updated. worker_id: {}".format( worker_id)) except KeyError: # it is possible that this worker was sent a gracful shutdown command in the other thread # and that spawner has already removed this worker from the store. logger.info( "worker {} got unexpected key error trying to update its execution time. " "Worker better be shutting down! keep_running: {}".format( worker_id, globals.keep_running)) if globals.keep_running: logger.error( "worker couldn't update's its execution time but keep_running is still true!" ) # we completed an execution successfully; reset the consecutive_errors counter consecutive_errors = 0 logger.info( "worker time stamps updated; worker_id: {}".format(worker_id)) logger.info( "global.keep_running no longer true. worker is now exited. worker id: {}" .format(worker_id))
def __execute_pm_applications__(execution, identifier, create_profile, use_storage_profile, profile_folder, singularity): """ It executes a Singularity PM application in a targatted testbed """ # If create_profile = True we need to create a profile and associate it with the execution profile_file = '' if create_profile: profile_file = profile_folder + '/' + str(uuid.uuid4()) + '.profile' # Lets recover all the information needed...execution_configuration execution_configuration = db.session.query( ExecutionConfiguration).filter_by(id=identifier).first( ) # This is to avoid reusing objects from other thread testbed = db.session.query(Testbed).filter_by( id=execution_configuration.testbed_id).first() deployment = db.session.query(Deployment).filter_by( executable_id=execution_configuration.executable_id, testbed_id=testbed.id).first() executable = db.session.query(Executable).filter_by( id=execution_configuration.executable_id).first() # Preparing the command to be executed command = "source" endpoint = testbed.endpoint params = [] params.append(testbed.extra_config['enqueue_env_file']) params.append(";") params.append("enqueue_compss") params.append("--sc_cfg=" + testbed.extra_config['enqueue_compss_sc_cfg']) params.append("--num_nodes=" + str(execution_configuration.num_nodes)) params.append("--gpus_per_node=" + str(execution_configuration.num_gpus_per_node)) params.append("--cpus_per_node=" + str(execution_configuration.num_cpus_per_node)) if singularity: params.append("--container_image=" + deployment.path) params.append( "--container_compss_path=/opt/TANGO/TANGO_ProgrammingModel/COMPSs/" ) # TODO Ugly... ugly... and more ugly... #params.append("--appdir=" + executable.singularity_app_folder) params.append( "--appdir=/apps/application/") # TODO Ugly... fix this... else: params.append("--appdir=" + executable.singularity_app_folder) params.append("--exec_time=" + str(execution_configuration.exec_time)) # If create profile if create_profile: params.append("--output_profile=" + profile_file) # If we use a profile --output_profile=<path> if use_storage_profile: params.append("--input_profile=" + execution_configuration.profile_file) params.append(execution_configuration.compss_config) params.append(execution_configuration.command) logging.info("Launching execution of application: command: " + command + " | endpoint: " + endpoint + " | params: " + str(params)) output = shell.execute_command(command, endpoint, params) sbatch_id = __extract_id_from_sigularity_pm_app__(output) execution = Execution() execution.execution_type = execution_configuration.execution_type execution.status = Execution.__status_running__ execution_configuration.executions.append(execution) # if we create the profile, we add it to the execution configuration if create_profile: execution_configuration.profile_file = profile_file execution.slurm_sbatch_id = sbatch_id db.session.commit() # Add nodes time.sleep(5) __add_nodes_to_execution__(execution, endpoint)
def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch): """ Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor containers when message arrive. Also subscribes to the worker channel for future communications. :return: """ logger.debug("Top of subscribe().") actor_ch = ActorMsgChannel(actor_id) try: leave_containers = Config.get('workers', 'leave_containers') except configparser.NoOptionError: logger.info("No leave_containers value confiured.") leave_containers = False if hasattr(leave_containers, 'lower'): leave_containers = leave_containers.lower() == "true" logger.info("leave_containers: {}".format(leave_containers)) ag = None if api_server and client_id and client_secret and access_token and refresh_token: logger.info("Creating agave client.") verify = get_tenant_verify(tenant) ag = Agave(api_server=api_server, token=access_token, refresh_token=refresh_token, api_key=client_id, api_secret=client_secret, verify=verify) else: logger.info("Not creating agave client.") logger.info("Starting the process worker channel thread.") t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag)) t.start() logger.info("Worker subscribing to actor channel.") # keep track of whether we need to update the worker's status back to READY; otherwise, we # will hit redis with an UPDATE every time the subscription loop times out (i.e., every 2s) update_worker_status = True # shared global tracking whether this worker should keep running; shared between this thread and # the "worker channel processing" thread. global keep_running # main subscription loop -- processing messages from actor's mailbox while keep_running: if update_worker_status: Worker.update_worker_status(actor_id, worker_id, READY) update_worker_status = False try: msg = actor_ch.get_one() except channelpy.ChannelClosedException: logger.info("Channel closed, worker exiting...") keep_running = False sys.exit() logger.info("worker {} processing new msg.".format(worker_id)) try: Worker.update_worker_status(actor_id, worker_id, BUSY) except Exception as e: logger.error("unexpected exception from call to update_worker_status." "actor_id: {}; worker_id: {}; status: {}; exception: {}".format(actor_id, worker_id, BUSY, e)) raise e update_worker_status = True logger.info("Received message {}. Starting actor container...".format(msg)) # the msg object is a dictionary with an entry called message and an arbitrary # set of k:v pairs coming in from the query parameters. message = msg.pop('message', '') actor = Actor.from_db(actors_store[actor_id]) execution_id = msg['_abaco_execution_id'] content_type = msg['_abaco_Content_Type'] mounts = actor.mounts logger.debug("actor mounts: {}".format(mounts)) # for results, create a socket in the configured directory. try: socket_host_path_dir = Config.get('workers', 'socket_host_path_dir') except (configparser.NoSectionError, configparser.NoOptionError): logger.error("No socket_host_path configured. 
Cannot manage results data.") Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for results data.") continue socket_host_path = '{}.sock'.format(os.path.join(socket_host_path_dir, worker_id, execution_id)) logger.info("Create socket at path: {}".format(socket_host_path)) # add the socket as a mount: mounts.append({'host_path': socket_host_path, 'container_path': '/_abaco_results.sock', 'format': 'ro'}) # for binary data, create a fifo in the configured directory. The configured # fifo_host_path_dir is equal to the fifo path in the worker container: fifo_host_path = None if content_type == 'application/octet-stream': try: fifo_host_path_dir = Config.get('workers', 'fifo_host_path_dir') except (configparser.NoSectionError, configparser.NoOptionError): logger.error("No fifo_host_path configured. Cannot manage binary data.") Actor.set_status(actor_id, ERROR, msg="Abaco instance not configured for binary data.") continue fifo_host_path = os.path.join(fifo_host_path_dir, worker_id, execution_id) try: os.mkfifo(fifo_host_path) logger.info("Created fifo at path: {}".format(fifo_host_path)) except Exception as e: logger.error("Could not create fifo_path. Exception: {}".format(e)) raise e # add the fifo as a mount: mounts.append({'host_path': fifo_host_path, 'container_path': '/_abaco_binary_data', 'format': 'ro'}) # the execution object was created by the controller, but we need to add the worker id to it now that we # know which worker will be working on the execution. logger.debug("Adding worker_id to execution.") Execution.add_worker_id(actor_id, execution_id, worker_id) # privileged dictates whether the actor container runs in privileged mode and if docker daemon is mounted. privileged = False if type(actor['privileged']) == bool and actor['privileged']: privileged = True logger.debug("privileged: {}".format(privileged)) # retrieve the default environment registered with the actor. environment = actor['default_environment'] logger.debug("Actor default environment: {}".format(environment)) # construct the user field from the actor's uid and gid: user = get_container_user(actor) logger.debug("Final user valiue: {}".format(user)) # overlay the default_environment registered for the actor with the msg # dictionary environment.update(msg) environment['_abaco_access_token'] = '' environment['_abaco_actor_dbid'] = actor_id environment['_abaco_actor_id'] = actor.id environment['_abaco_actor_state'] = actor.state logger.debug("Overlayed environment: {}".format(environment)) # if we have an agave client, get a fresh set of tokens: if ag: try: ag.token.refresh() token = ag.token.token_info['access_token'] environment['_abaco_access_token'] = token logger.info("Refreshed the tokens. Passed {} to the environment.".format(token)) except Exception as e: logger.error("Got an exception trying to get an access token: {}".format(e)) else: logger.info("Agave client `ag` is None -- not passing access token.") logger.info("Passing update environment: {}".format(environment)) try: stats, logs, final_state, exit_code, start_time = execute_actor(actor_id, worker_id, execution_id, image, message, user, environment, privileged, mounts, leave_containers, fifo_host_path, socket_host_path) except DockerStartContainerError as e: logger.error("Got DockerStartContainerError: {}".format(e)) Actor.set_status(actor_id, ERROR, "Error executing container: {}".format(e)) continue # Add the completed stats to the execution logger.info("Actor container finished successfully. 
Got stats object:{}".format(str(stats))) Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code, start_time) logger.info("Added execution: {}".format(execution_id)) # Add the logs to the execution Execution.set_logs(execution_id, logs) logger.info("Added execution logs.") # Update the worker's last updated and last execution fields: try: Worker.update_worker_execution_time(actor_id, worker_id) except KeyError: # it is possible that this worker was sent a gracful shutdown command in the other thread # and that spawner has already removed this worker from the store. logger.info("worker {} got unexpected key error trying to update its execution time. " "Worker better be shutting down! keep_running: {}".format(worker_id, keep_running)) if keep_running: logger.error("worker couldn't update's its execution time but keep_running is still true!") logger.info("worker time stamps updated.")
def subscribe(tenant, actor_id, api_server, client_id, client_secret, access_token, refresh_token, worker_ch): """ Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor containers when message arrive. Also subscribes to the worker channel for future communications. :return: """ actor_ch = ActorMsgChannel(actor_id) ag = None if api_server and client_id and client_secret and access_token and refresh_token: ag = Agave(api_server=api_server, token=access_token, refresh_token=refresh_token, api_key=client_id, api_secret=client_secret) else: print("Not creating agave client.") t = threading.Thread(target=process_worker_ch, args=(tenant, worker_ch, actor_id, actor_ch, ag)) t.start() print("Worker subscribing to actor channel...") global keep_running while keep_running: Worker.update_worker_status(actor_id, worker_ch.name, READY) try: msg = actor_ch.get(timeout=2) except channelpy.ChannelTimeoutException: continue except channelpy.ChannelClosedException: print("Channel closed, worker exiting...") keep_running = False sys.exit() print("Received message {}. Starting actor container...".format(str(msg))) # the msg object is a dictionary with an entry called message and an arbitrary # set of k:v pairs coming in from the query parameters. message = msg.pop('message', '') actor = Actor.from_db(actors_store[actor_id]) execution_id = msg['_abaco_execution_id'] privileged = False if actor['privileged'] == 'TRUE': privileged = True environment = actor['default_environment'] print("Actor default environment: {}".format(environment)) print("Actor privileged: {}".format(privileged)) # overlay the default_environment registered for the actor with the msg # dictionary environment.update(msg) environment['_abaco_access_token'] = '' # if we have an agave client, get a fresh set of tokens: if ag: try: ag.token.refresh() token = ag.token.token_info['access_token'] environment['_abaco_access_token'] = token print("Refreshed the tokens. Passed {} to the environment.".format(token)) except Exception as e: print("Got an exception trying to get an access token: {}".format(e)) else: print("Agave client `ag` is None -- not passing access token.") print("Passing update environment: {}".format(environment)) try: stats, logs = execute_actor(actor_id, worker_ch, image, message, environment, privileged) except DockerStartContainerError as e: print("Got DockerStartContainerError: {}".format(str(e))) Actor.set_status(actor_id, ERROR) continue # add the execution to the actor store print("Actor container finished successfully. Got stats object:{}".format(str(stats))) Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats) print("Added execution: {}".format(execution_id)) Execution.set_logs(execution_id, logs) Worker.update_worker_execution_time(actor_id, worker_ch.name)
def test_patch_execution_preprocessor(self, mock_restart_execution, mock_executor_stop, mock_executor_cancel, mock_executor_add, mock_executor_remove): """ It test the correct work of the method of canceling an execution """ # First we verify that nothing happens if launch_execution = False data = {'status': 'PEPITO'} response = self.client.patch("/api/v1/executions/100", data=json.dumps(data), content_type='application/json') self.assertEquals(409, response.status_code) self.assertEquals( 'No execution by the given id', response.json['message']) # Preparing the data for the rest of the test testbed = Testbed("name", False, "slurm", "ssh", "user@server", ['slurm']) db.session.add(testbed) db.session.commit() application = Application() application.name = "xxx" application.application_type = "XXX" db.session.add(application) db.session.commit() execution_configuration = ExecutionConfiguration() execution_configuration.testbed = testbed execution_configuration.application = application db.session.add(execution_configuration) db.session.commit() execution = Execution() execution.execution_type = Executable.__type_singularity_srun__ execution.status = Execution.__status_running__ execution.execution_configuration = execution_configuration db.session.add(execution) db.session.commit() response = self.client.patch("/api/v1/executions/" + str(execution.id) , data=json.dumps(data), content_type='application/json') self.assertEquals(409, response.status_code) self.assertEquals( 'No valid state to try to change', response.json['message']) data = {'PEPITO': 'PEPITO'} response = self.client.patch("/api/v1/executions/" + str(execution.id) , data=json.dumps(data), content_type='application/json') self.assertEquals(409, response.status_code) self.assertEquals( 'No status, remove_resource, or add_resource field in the payload', response.json['message']) data = {'status': 'CANCEL'} response = self.client.patch("/api/v1/executions/" + str(execution.id) , data=json.dumps(data), content_type='application/json') self.assertEquals(200, response.status_code) mock_executor_cancel.assert_called_with(execution, 'user@server') data = {'add_resource': ''} response = self.client.patch("/api/v1/executions/" + str(execution.id) , data=json.dumps(data), content_type='application/json') mock_executor_add.assert_called_with(execution) data = {'remove_resource': ''} response = self.client.patch("/api/v1/executions/" + str(execution.id) , data=json.dumps(data), content_type='application/json') mock_executor_remove.assert_called_with(execution) # Adding Checkpointable changes of status at ALDE level. 
execution.status = Execution.__status_running__ application.application_type = Application.CHECKPOINTABLE db.session.commit() data = {'status': 'STOP'} response = self.client.patch("/api/v1/executions/" + str(execution.id), data=json.dumps(data), content_type="application/json") mock_executor_stop.assert_called_with(execution) execution.status = Execution.__status_cancel__ db.session.commit() response = self.client.patch("/api/v1/executions/" + str(execution.id), data=json.dumps(data), content_type="application/json") self.assertEquals(409, response.status_code) self.assertEquals( 'Execution is not in right state', response.json['message']) # Checkpointable restart execution.status = Execution.__status_stopped__ db.session.commit() data = {'status': 'RESTART'} response = self.client.patch("/api/v1/executions/" + str(execution.id), data=json.dumps(data), content_type="application/json") mock_restart_execution.assert_called_with(execution) execution.status = Execution.__status_cancel__ db.session.commit() response = self.client.patch("/api/v1/executions/" + str(execution.id), data=json.dumps(data), content_type="application/json") self.assertEquals(409, response.status_code) self.assertEquals( 'Execution is not in right state', response.json['message'])
def post(self, actor_id): def get_hypermedia(actor, exc): return {'_links': {'self': '{}/actors/v2/{}/executions/{}'.format(actor.api_server, actor.id, exc), 'owner': '{}/profiles/v2/{}'.format(actor.api_server, actor.owner), 'messages': '{}/actors/v2/{}/messages'.format(actor.api_server, actor.id)},} logger.debug("top of POST /actors/{}/messages.".format(actor_id)) dbid = Actor.get_dbid(g.tenant, actor_id) try: Actor.from_db(actors_store[dbid]) except KeyError: logger.debug("did not find actor: {}.".format(actor_id)) raise ResourceError("No actor found with id: {}.".format(actor_id), 404) args = self.validate_post() d = {} # build a dictionary of k:v pairs from the query parameters, and pass a single # additional object 'message' from within the post payload. Note that 'message' # need not be JSON data. logger.debug("POST body validated. actor: {}.".format(actor_id)) for k, v in request.args.items(): if k == 'message': continue d[k] = v logger.debug("extra fields added to message from query parameters: {}.".format(d)) if hasattr(g, 'user'): d['_abaco_username'] = g.user logger.debug("_abaco_username: {} added to message.".format(g.user)) if hasattr(g, 'api_server'): d['_abaco_api_server'] = g.api_server logger.debug("_abaco_api_server: {} added to message.".format(g.api_server)) # if hasattr(g, 'jwt'): # d['_abaco_jwt'] = g.jwt # if hasattr(g, 'jwt_server'): # d['_abaco_jwt_server'] = g.jwt_server if hasattr(g, 'jwt_header_name'): d['_abaco_jwt_header_name'] = g.jwt_header_name logger.debug("abaco_jwt_header_name: {} added to message.".format(g.jwt_header_name)) # create an execution exc = Execution.add_execution(dbid, {'cpu': 0, 'io': 0, 'runtime': 0, 'status': SUBMITTED, 'executor': g.user}) logger.info("Execution {} added for actor {}".format(exc, actor_id)) d['_abaco_execution_id'] = exc d['_abaco_Content_Type'] = args.get('_abaco_Content_Type', '') logger.debug("Final message dictionary: {}".format(d)) ch = ActorMsgChannel(actor_id=dbid) ch.put_msg(message=args['message'], d=d) ch.close() logger.debug("Message added to actor inbox. id: {}.".format(actor_id)) # make sure at least one worker is available actor = Actor.from_db(actors_store[dbid]) actor.ensure_one_worker() logger.debug("ensure_one_worker() called. id: {}.".format(actor_id)) if args.get('_abaco_Content_Type') == 'application/octet-stream': result = {'execution_id': exc, 'msg': 'binary - omitted'} else: result={'execution_id': exc, 'msg': args['message']} result.update(get_hypermedia(actor, exc)) case = Config.get('web', 'case') if not case == 'camel': return ok(result) else: return ok(dict_to_camel(result))
def subscribe(tenant, actor_id, worker_id, api_server, client_id, client_secret, access_token,
              refresh_token, worker_ch):
    """
    Main loop for the Actor executor worker. Subscribes to the actor's inbox and executes actor
    containers when messages arrive. Also subscribes to the worker channel for future
    communications.
    :return:
    """
    logger.debug("Top of subscribe().")
    actor_ch = ActorMsgChannel(actor_id)
    ag = None
    if api_server and client_id and client_secret and access_token and refresh_token:
        logger.info("Creating agave client.")
        ag = Agave(api_server=api_server,
                   token=access_token,
                   refresh_token=refresh_token,
                   api_key=client_id,
                   api_secret=client_secret)
    else:
        logger.info("Not creating agave client.")
    logger.info("Starting the process worker channel thread.")
    t = threading.Thread(target=process_worker_ch,
                         args=(tenant, worker_ch, actor_id, worker_id, actor_ch, ag))
    t.start()
    logger.info("Worker subscribing to actor channel.")
    update_worker_status = True
    global keep_running
    while keep_running:
        if update_worker_status:
            Worker.update_worker_status(actor_id, worker_id, READY)
            update_worker_status = False
        try:
            msg = actor_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        except channelpy.ChannelClosedException:
            logger.info("Channel closed, worker exiting...")
            keep_running = False
            sys.exit()
        update_worker_status = True
        logger.info("Received message {}. Starting actor container...".format(msg))
        # the msg object is a dictionary with an entry called message and an arbitrary
        # set of k:v pairs coming in from the query parameters.
        message = msg.pop('message', '')
        actor = Actor.from_db(actors_store[actor_id])
        execution_id = msg['_abaco_execution_id']
        # the execution object was created by the controller, but we need to add the worker id
        # to it now that we know which worker will be working on the execution.
        logger.debug("Adding worker_id to execution.")
        Execution.add_worker_id(actor_id, execution_id, worker_id)
        # privileged dictates whether the actor container runs in privileged mode and if the
        # docker daemon is mounted.
        privileged = False
        if actor['privileged'] == 'TRUE':
            privileged = True
        logger.debug("privileged: {}".format(privileged))
        # retrieve the default environment registered with the actor.
        environment = actor['default_environment']
        logger.debug("Actor default environment: {}".format(environment))
        # overlay the default_environment registered for the actor with the msg dictionary
        environment.update(msg)
        environment['_abaco_access_token'] = ''
        environment['_abaco_actor_dbid'] = actor_id
        environment['_abaco_actor_id'] = actor.id
        environment['_abaco_actor_state'] = actor.state
        logger.debug("Overlayed environment: {}".format(environment))
        # if we have an agave client, get a fresh set of tokens:
        if ag:
            try:
                ag.token.refresh()
                token = ag.token.token_info['access_token']
                environment['_abaco_access_token'] = token
                logger.info("Refreshed the tokens. Passed {} to the environment.".format(token))
            except Exception as e:
                logger.error("Got an exception trying to get an access token: {}".format(e))
        else:
            logger.info("Agave client `ag` is None -- not passing access token.")
        logger.info("Passing updated environment: {}".format(environment))
        try:
            stats, logs, final_state, exit_code = execute_actor(actor_id, worker_id, worker_ch,
                                                                image, message, environment,
                                                                privileged)
        except DockerStartContainerError as e:
            logger.error("Got DockerStartContainerError: {}".format(str(e)))
            Actor.set_status(actor_id, ERROR)
            continue
        # Add the completed stats to the execution
        logger.info("Actor container finished successfully. Got stats object: {}".format(str(stats)))
        Execution.finalize_execution(actor_id, execution_id, COMPLETE, stats, final_state, exit_code)
        logger.info("Added execution: {}".format(execution_id))
        # Add the logs to the execution
        Execution.set_logs(execution_id, logs)
        logger.info("Added execution logs.")
        # Update the worker's last updated and last execution fields:
        Worker.update_worker_execution_time(actor_id, worker_id)
        logger.info("worker time stamps updated.")
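# Hedged sketch of the companion thread that subscribe() starts above: process_worker_ch() is
# assumed to listen on the worker channel and flip the module-level keep_running flag when a
# shutdown command arrives, letting the main loop drain and exit cleanly. The 'stop' command
# format below is an illustrative assumption, not the project's actual worker protocol.
def process_worker_ch_sketch(tenant, worker_ch, actor_id, worker_id, actor_ch, ag):
    global keep_running
    while True:
        try:
            msg = worker_ch.get(timeout=2)
        except channelpy.ChannelTimeoutException:
            continue
        if msg.get('command') == 'stop':
            # signal the subscribe() loop to stop and close the actor channel it is reading from
            keep_running = False
            actor_ch.close()
            return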
def test_execute_application_type_torque_qsub(self, mock_shell, mock_add_nodes):
    """It verifies that an application of type torque:qsub is executed"""

    # First we verify that the testbed must be of type TORQUE to be able to execute the
    # application; in this case it should give an error since the testbed is not of type torque.

    # We define the different entities necessary for the test.
    testbed = Testbed(
        name="nova2",
        on_line=True,
        category="xxxx",
        protocol="SSH",
        endpoint="*****@*****.**",
        package_formats=['sbatch', 'SINGULARITY'],
        extra_config={
            "enqueue_compss_sc_cfg": "nova.cfg",
            "enqueue_env_file": "/home_nfs/home_ejarquej/installations/rc1707/COMPSs/compssenv"
        })
    db.session.add(testbed)

    application = Application(name="super_app")
    db.session.add(application)
    db.session.commit()  # So application and testbed get an id

    executable = Executable()
    executable.compilation_type = Executable.__type_torque_qsub__
    executable.executable_file = "pepito.sh"
    db.session.add(executable)
    db.session.commit()  # We do this so executable gets an id

    deployment = Deployment()
    deployment.testbed_id = testbed.id
    deployment.executable_id = executable.id
    db.session.add(deployment)  # We add the deployment to the db so it has an id

    execution_config = ExecutionConfiguration()
    execution_config.execution_type = Executable.__type_torque_qsub__
    execution_config.application = application
    execution_config.testbed = testbed
    execution_config.executable = executable
    db.session.add(execution_config)
    db.session.commit()

    execution = Execution()
    execution.execution_type = Executable.__type_torque_qsub__
    execution.status = Execution.__status_submitted__

    torque.execute_batch(execution, execution_config.id)

    self.assertEquals(Execution.__status_failed__, execution.status)
    self.assertEquals("Testbed does not support TORQUE:QSUB applications",
                      execution.output)

    # If the testbed is off-line, execution is not allowed either
    testbed.category = Testbed.torque_category
    testbed.on_line = False
    db.session.commit()

    execution = Execution()
    execution.execution_type = Executable.__type_torque_qsub__
    execution.status = Execution.__status_submitted__

    torque.execute_batch(execution, execution_config.id)

    self.assertEquals(Executable.__type_torque_qsub__, execution.execution_type)
    self.assertEquals(Execution.__status_failed__, execution.status)
    self.assertEquals("Testbed is off-line", execution.output)

    # Test the actual submission
    output = b'1208.cloudserver'
    mock_shell.return_value = output

    testbed.category = Testbed.torque_category
    testbed.on_line = True
    db.session.commit()

    execution = Execution()
    execution.execution_type = Executable.__type_torque_qsub__
    execution.status = Execution.__status_submitted__

    torque.execute_batch(execution, execution_config.id)

    mock_shell.assert_called_with("qsub", "*****@*****.**", ["pepito.sh"])

    execution = db.session.query(Execution).filter_by(
        execution_configuration_id=execution_config.id).first()
    self.assertEqual(execution.execution_type, execution_config.execution_type)
    self.assertEqual(execution.status, Execution.__status_running__)
    self.assertEqual("1208.cloudserver", execution.batch_id)
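# Minimal sketch of the qsub-output handling the test above asserts on: TORQUE's qsub prints the
# job id (e.g. b'1208.cloudserver'), which becomes the execution's batch_id while the status
# moves to running. The function and shell-helper names are assumptions for illustration only.
def submit_torque_job_sketch(execution, endpoint, script):
    output = shell.execute_command("qsub", endpoint, [script])  # mocked as mock_shell in the test
    execution.batch_id = output.decode('utf-8').strip()
    execution.status = Execution.__status_running__
    db.session.commit()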
def execute_actor(actor_id,
                  worker_id,
                  execution_id,
                  image,
                  msg,
                  user=None,
                  d={},
                  privileged=False,
                  mounts=[],
                  leave_container=False,
                  fifo_host_path=None,
                  socket_host_path=None,
                  mem_limit=None,
                  max_cpus=None):
    """
    Creates and runs an actor container and supervises the execution, collecting statistics
    about resource consumption from the Docker daemon.

    :param actor_id: the dbid of the actor; for updating worker status
    :param worker_id: the worker id; also for updating worker status
    :param execution_id: the id of the execution.
    :param image: the actor's image; worker must have already downloaded this image to the
        local docker registry.
    :param msg: the message being passed to the actor.
    :param user: string in the form {uid}:{gid} representing the uid and gid to run the command as.
    :param d: dictionary representing the environment to instantiate within the actor container.
    :param privileged: whether this actor is "privileged"; i.e., its container should run in
        privileged mode with the docker daemon mounted.
    :param mounts: list of dictionaries representing the mounts to add; each dictionary mount
        should have 3 keys: host_path, container_path and format (which should have value 'ro'
        or 'rw').
    :param fifo_host_path: If not None, a string representing a path on the host to a FIFO used
        for passing binary data to the actor.
    :param socket_host_path: If not None, a string representing a path on the host to a socket
        used for collecting results from the actor.
    :param mem_limit: The maximum amount of memory the Actor container can use; should be the
        same format as the --memory Docker flag.
    :param max_cpus: The maximum number of CPUs each actor will have available to them. Does not
        guarantee these CPU resources; serves as upper bound.
    :return: result (dict), logs (str), container_state (dict), exit_code, start_time -
        `result`: statistics about resource consumption; `logs`: output from docker logs.
    """
    logger.debug("top of execute_actor(); (worker {};{})".format(worker_id, execution_id))

    # initially set the global force_quit variable to False
    globals.force_quit = False

    # initial stats object, environment, binds and volumes
    result = {'cpu': 0, 'io': 0, 'runtime': 0}

    # instantiate docker client
    cli = docker.APIClient(base_url=dd, version="auto")

    # don't try to pass binary messages through the environment as these can cause
    # broken pipe errors. the binary data will be passed through the FIFO momentarily.
    if not fifo_host_path:
        d['MSG'] = msg
    binds = {}
    volumes = []

    # if container is privileged, mount the docker daemon so that additional
    # containers can be started.
    logger.debug("privileged: {};(worker {};{})".format(privileged, worker_id, execution_id))
    if privileged:
        binds = {'/var/run/docker.sock': {'bind': '/var/run/docker.sock', 'ro': False}}
        volumes = ['/var/run/docker.sock']

    # add a bind key and dictionary as well as a volume for each mount
    for m in mounts:
        binds[m.get('host_path')] = {'bind': m.get('container_path'),
                                     'ro': m.get('format') == 'ro'}
        volumes.append(m.get('host_path'))

    # mem_limit: -1 => unlimited memory
    if mem_limit == '-1':
        mem_limit = None

    # max_cpus
    try:
        max_cpus = int(max_cpus)
    except:
        max_cpus = None
    # -1 => unlimited cpus
    if max_cpus == -1:
        max_cpus = None

    host_config = cli.create_host_config(binds=binds,
                                         privileged=privileged,
                                         mem_limit=mem_limit,
                                         nano_cpus=max_cpus)
    logger.debug("host_config object created by (worker {};{}).".format(worker_id, execution_id))

    # write binary data to FIFO if it exists:
    if fifo_host_path:
        try:
            fifo = os.open(fifo_host_path, os.O_RDWR)
            os.write(fifo, msg)
        except Exception as e:
            logger.error("Error writing the FIFO. Exception: {};(worker {};{})".format(
                e, worker_id, execution_id))
            os.remove(fifo_host_path)
            raise DockerStartContainerError("Error writing to fifo: {}; "
                                            "(worker {};{})".format(e, worker_id, execution_id))

    # set up results socket -----------------------
    # make sure socket doesn't already exist:
    try:
        os.unlink(socket_host_path)
    except OSError as e:
        if os.path.exists(socket_host_path):
            logger.error("socket at {} already exists; Exception: {}; (worker {};{})".format(
                socket_host_path, e, worker_id, execution_id))
            raise DockerStartContainerError("Got an OSError trying to create the results socket; "
                                            "exception: {}".format(e))

    # use retry logic since, when the compute node is under load, we see errors initially
    # trying to create the socket server object.
    keep_trying = True
    count = 0
    server = None
    while keep_trying and count < 10:
        keep_trying = False
        count = count + 1
        try:
            server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
        except Exception as e:
            keep_trying = True
            logger.info("Could not instantiate socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id, execution_id))
        try:
            server.bind(socket_host_path)
        except Exception as e:
            keep_trying = True
            logger.info("Could not bind socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id, execution_id))
        try:
            os.chmod(socket_host_path, 0o777)
            logger.debug("results socket permissions set to 777. socket_host_path: {}".format(
                socket_host_path))
        except Exception as e:
            msg = f"Got exception trying to set permissions on the results socket. Not sure what to do. e: {e}"
            logger.error(msg)
            # for now, we'll just swallow it but this is really a TODO.
        try:
            server.settimeout(RESULTS_SOCKET_TIMEOUT)
        except Exception as e:
            keep_trying = True
            logger.info("Could not set timeout for socket at {}. "
                        "Count: {}; Will keep trying. "
                        "Exception: {}; type: {}; (worker {};{})".format(
                            socket_host_path, count, e, type(e), worker_id, execution_id))
    if not server:
        msg = "Failed to instantiate results socket. " \
              "Abaco compute host could be overloaded. (worker {};{})".format(worker_id, execution_id)
        logger.error(msg)
        raise DockerStartContainerError(msg)
    logger.debug("results socket server instantiated. path: {} (worker {};{})".format(
        socket_host_path, worker_id, execution_id))

    # instantiate the results channel:
    results_ch = ExecutionResultsChannel(actor_id, execution_id)

    # create and start the container
    logger.debug("Final container environment: {};(worker {};{})".format(
        d, worker_id, execution_id))
    logger.debug("Final binds: {} and host_config: {} for the container.(worker {};{})".format(
        binds, host_config, worker_id, execution_id))
    container = cli.create_container(image=image,
                                     environment=d,
                                     user=user,
                                     volumes=volumes,
                                     host_config=host_config)
    # get the UTC time stamp
    start_time = get_current_utc_time()
    # start the timer to track total execution time.
    start = timeit.default_timer()
    logger.debug("right before cli.start: {}; container id: {}; "
                 "(worker {};{})".format(start, container.get('Id'), worker_id, execution_id))
    try:
        cli.start(container=container.get('Id'))
    except Exception as e:
        # if there was an error starting the container, user will need to debug
        logger.info("Got exception starting actor container: {}; (worker {};{})".format(
            e, worker_id, execution_id))
        raise DockerStartContainerError("Could not start container {}. Exception {}".format(
            container.get('Id'), str(e)))

    # local bool tracking whether the actor container is still running
    running = True
    Execution.update_status(actor_id, execution_id, RUNNING)

    logger.debug("right before creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    # create a separate cli for checking stats objects since these should be fast and we don't
    # want to wait
    stats_cli = docker.APIClient(base_url=dd, timeout=1, version="auto")
    logger.debug("right after creating stats_cli: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # under load, we can see UnixHTTPConnectionPool ReadTimeout's trying to create the stats_obj
    # so here we are trying up to 3 times to create the stats object for a possible total of 3s
    # timeouts
    ct = 0
    stats_obj = None
    logs = None
    while ct < 3:
        try:
            stats_obj = stats_cli.stats(container=container.get('Id'), decode=True)
            break
        except ReadTimeout:
            ct += 1
        except Exception as e:
            logger.error("Unexpected exception creating stats_obj. Exception: {}; (worker {};{})".format(
                e, worker_id, execution_id))
            # in this case, we need to kill the container since we cannot collect stats;
            # UPDATE - 07-2018: under load, errors can occur attempting to create the stats object.
            # the container could still be running; we need to explicitly check the container
            # status to be sure.
    logger.debug("right after attempting to create stats_obj: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # a counter of the number of iterations through the main "running" loop;
    # this counter is used to determine when less frequent actions, such as log aggregation,
    # need to run.
    loop_idx = 0
    while running and not globals.force_quit:
        loop_idx += 1
        logger.debug("top of while running loop; loop_idx: {}".format(loop_idx))
        datagram = None
        stats = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            pass
        except Exception as e:
            logger.error("got exception from server.recv: {}; (worker {};{})".format(
                e, worker_id, execution_id))
        logger.debug("right after try/except datagram block: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error("Error trying to put datagram on results channel. "
                             "Exception: {}; (worker {};{})".format(e, worker_id, execution_id))
        logger.debug("right after results ch.put: {}; (worker {};{})".format(
            timeit.default_timer(), worker_id, execution_id))

        # only try to collect stats if we have a stats_obj:
        if stats_obj:
            logger.debug("we have a stats_obj; trying to collect stats. (worker {};{})".format(
                worker_id, execution_id))
            try:
                logger.debug("waiting on a stats obj: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
                stats = next(stats_obj)
                logger.debug("got the stats obj: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
            except StopIteration:
                # we have read the last stats object - no need for processing
                logger.debug("Got StopIteration; no stats object. (worker {};{})".format(
                    worker_id, execution_id))
            except ReadTimeoutError:
                # this is a ReadTimeoutError from docker, not requests. container is finished.
                logger.info("next(stats) just timed out: {}; (worker {};{})".format(
                    timeit.default_timer(), worker_id, execution_id))
                # UPDATE - 07-2018: under load, a ReadTimeoutError from the attempt to get a stats
                # object does NOT imply the container has stopped; we need to explicitly check the
                # container status to be sure.

        # if we got a stats object, add it to the results; it is possible stats collection
        # timed out and the object is None
        if stats:
            logger.debug("adding stats to results; (worker {};{})".format(worker_id, execution_id))
            try:
                result['cpu'] += stats['cpu_stats']['cpu_usage']['total_usage']
            except KeyError as e:
                logger.info("Got a KeyError trying to fetch the cpu object: {}; "
                            "(worker {};{})".format(e, worker_id, execution_id))
            try:
                result['io'] += stats['networks']['eth0']['rx_bytes']
            except KeyError as e:
                logger.info("Got KeyError exception trying to grab the io object. "
                            "running: {}; Exception: {}; (worker {};{})".format(
                                running, e, worker_id, execution_id))

        # grab the logs every 5th iteration --
        if loop_idx % 5 == 0:
            logs = cli.logs(container.get('Id'))
            Execution.set_logs(execution_id, logs)
            logs = None

        # checking the container status to see if it is still running ----
        if running:
            logger.debug("about to check container status: {}; (worker {};{})".format(
                timeit.default_timer(), worker_id, execution_id))
            # we need to wait for the container id to be available
            i = 0
            c = None
            while i < 10:
                try:
                    c = cli.containers(all=True, filters={'id': container.get('Id')})[0]
                    break
                except IndexError:
                    logger.error("Got an IndexError trying to get the container object. "
                                 "(worker {};{})".format(worker_id, execution_id))
                    time.sleep(0.1)
                    i += 1
            logger.debug("done checking status: {}; i: {}; (worker {};{})".format(
                timeit.default_timer(), i, worker_id, execution_id))
            # if we were never able to get the container object, we need to stop processing and
            # kill this worker; the docker daemon could be under heavy load, but we need to not
            # launch another actor container with this worker, because the existing container may
            # still be running,
            if i == 10 or not c:
                # we'll try to stop the container
                logger.error("Never could retrieve the container object! Attempting to stop container; "
                             "container id: {}; (worker {};{})".format(
                                 container.get('Id'), worker_id, execution_id))
                # stop_container could raise an exception - if so, we let it pass up and have the
                # worker shut itself down.
                stop_container(cli, container.get('Id'))
                logger.info("container {} stopped. (worker {};{})".format(
                    container.get('Id'), worker_id, execution_id))
                # if we were able to stop the container, we can set running to False and keep the
                # worker running
                running = False
                continue
            state = c.get('State')
            if not state == 'running':
                logger.debug("container finished, final state: {}; (worker {};{})".format(
                    state, worker_id, execution_id))
                running = False
                continue
            else:
                # container still running; check if a force_quit has been sent OR
                # we are beyond the max_run_time
                runtime = timeit.default_timer() - start
                if globals.force_quit or (max_run_time > 0 and max_run_time < runtime):
                    logs = cli.logs(container.get('Id'))
                    if globals.force_quit:
                        logger.info("issuing force quit: {}; (worker {};{})".format(
                            timeit.default_timer(), worker_id, execution_id))
                    else:
                        logger.info("hit runtime limit: {}; (worker {};{})".format(
                            timeit.default_timer(), worker_id, execution_id))
                    cli.stop(container.get('Id'))
                    running = False
            logger.debug("right after checking container state: {}; (worker {};{})".format(
                timeit.default_timer(), worker_id, execution_id))

    logger.info("container stopped:{}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    stop = timeit.default_timer()
    globals.force_quit = False

    # get info from container execution, including exit code; Exceptions from any of these
    # commands should not cause the worker to shutdown or prevent starting subsequent actor
    # containers.
    try:
        container_info = cli.inspect_container(container.get('Id'))
        try:
            container_state = container_info['State']
            try:
                exit_code = container_state['ExitCode']
            except KeyError as e:
                logger.error("Could not determine ExitCode for container {}. "
                             "Exception: {}; (worker {};{})".format(
                                 container.get('Id'), e, worker_id, execution_id))
                exit_code = 'undetermined'
        except KeyError as e:
            logger.error("Could not determine final state for container {}. "
                         "Exception: {}; (worker {};{})".format(
                             container.get('Id'), e, worker_id, execution_id))
            container_state = {'unavailable': True}
    except docker.errors.APIError as e:
        logger.error("Could not inspect container {}. "
                     "Exception: {}; (worker {};{})".format(
                         container.get('Id'), e, worker_id, execution_id))
    logger.debug("right after getting container_info: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # get logs from container
    if not logs:
        logs = cli.logs(container.get('Id'))
    if not logs:
        # there are issues where containers do not have logs associated with them when they should.
        logger.info("Container id {} had NO logs associated with it. "
                    "(worker {};{})".format(container.get('Id'), worker_id, execution_id))
    logger.debug("right after getting container logs: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # get any additional results from the execution:
    while True:
        datagram = None
        try:
            datagram = server.recv(MAX_RESULT_FRAME_SIZE)
        except socket.timeout:
            break
        except Exception as e:
            logger.error("Got exception from server.recv: {}; (worker {};{})".format(
                e, worker_id, execution_id))
        if datagram:
            try:
                results_ch.put(datagram)
            except Exception as e:
                logger.error("Error trying to put datagram on results channel. "
                             "Exception: {}; (worker {};{})".format(e, worker_id, execution_id))
    logger.debug("right after getting last execution results from datagram socket: {}; "
                 "(worker {};{})".format(timeit.default_timer(), worker_id, execution_id))
    if socket_host_path:
        server.close()
        os.remove(socket_host_path)
    logger.debug("right after removing socket: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    # remove actor container with retrying logic -- check for specific filesystem errors from the
    # docker daemon:
    if not leave_container:
        keep_trying = True
        count = 0
        while keep_trying and count < 10:
            keep_trying = False
            count = count + 1
            try:
                cli.remove_container(container=container)
                logger.info("Actor container removed. (worker {};{})".format(
                    worker_id, execution_id))
            except Exception as e:
                # if the container is already gone we definitely want to quit:
                if 'No such container' in str(e):
                    logger.info("Got 'no such container' exception - quitting. "
                                "Exception: {}; (worker {};{})".format(e, worker_id, execution_id))
                    break
                # if we get a resource busy/internal server error from docker, we need to keep
                # trying to remove the container.
                elif 'device or resource busy' in str(e) or 'failed to remove root filesystem' in str(e):
                    logger.error("Got resource busy/failed to remove filesystem exception trying "
                                 "to remove actor container; will keep trying. "
                                 "Count: {}; Exception: {}; (worker {};{})".format(
                                     count, e, worker_id, execution_id))
                    time.sleep(1)
                    keep_trying = True
                else:
                    logger.error("Unexpected exception trying to remove actor container. Giving up. "
                                 "Exception: {}; type: {}; (worker {};{})".format(
                                     e, type(e), worker_id, execution_id))
    else:
        logger.debug("leaving actor container since leave_container was True. "
                     "(worker {};{})".format(worker_id, execution_id))
    logger.debug("right after removing actor container: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))

    if fifo_host_path:
        os.close(fifo)
        os.remove(fifo_host_path)
    if results_ch:
        results_ch.close()
    result['runtime'] = int(stop - start)
    logger.debug("right after removing fifo; about to return: {}; (worker {};{})".format(
        timeit.default_timer(), worker_id, execution_id))
    return result, logs, container_state, exit_code, start_time
def getAllExecution(self, page_id):
    """Returns one page (two items) of executions as a list of dictionaries."""
    values = Execution.select().paginate(page_id, 2)
    arrayOfData = [model_to_dict(execution) for execution in values]
    return arrayOfData
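# Hedged usage sketch for the peewee-based paginator above: walking successive pages of two
# executions each until an empty page is returned. The service/holder name is an assumption.
def iter_all_executions_sketch(service):
    page_id = 1
    while True:
        page = service.getAllExecution(page_id)
        if not page:
            break
        for execution_dict in page:
            yield execution_dict
        page_id += 1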