def push_to_db(self): db, cur = akrr.db.get_akrr_db() try: if hasattr(self, 'TimeJobPossiblyCompleted'): time_finished = self.TimeJobPossiblyCompleted else: time_finished = datetime.datetime.today() self.push_to_db_raw(cur, self.task_id, time_finished) db.commit() cur.close() del db self.ToDoNextString = "task_is_complete" return None except: db.rollback() db.commit() cur.close() del db self.PushToDBAttemps += 1 if self.PushToDBAttemps <= cfg.export_db_max_repeat_attempts: akrr.util.log.log_traceback( "AKRR server was not able to push to external DB.") self.status = "ERROR: Can not push to external DB, will try again" self.status_info = traceback.format_exc() return cfg.export_db_repeat_attempt_in else: akrr.util.log.log_traceback( "AKRR server was not able to push to external DB will only update local." ) self.status = "ERROR: Can not push to external DB, will try again" self.status_info = traceback.format_exc() self.ToDoNextString = "task_is_complete" return None
def push_to_db(self): db, cur = akrr.db.get_akrr_db() try: if self.TimeJobPossiblyCompleted is not None: time_finished = self.TimeJobPossiblyCompleted else: time_finished = datetime.datetime.today() self.push_to_db_raw(cur, self.task_id, time_finished) db.commit() cur.close() del db self.set_method_to_run_next("task_is_complete") return datetime.timedelta(seconds=3) except Exception as e: log.exception("Got exception in process_results_old: %s\n%s\n", e, traceback.format_exc()) db.rollback() db.commit() cur.close() del db self.PushToDBAttemps += 1 if self.PushToDBAttemps <= cfg.export_db_max_repeat_attempts: akrr.util.log.log_traceback("AKRR server was not able to push to external DB.") self.set_method_to_run_next( None, "ERROR: Can not push to external DB, will try again", traceback.format_exc()) return cfg.export_db_repeat_attempt_in else: akrr.util.log.log_traceback("AKRR server was not able to push to external DB will only update local.") self.set_method_to_run_next( "task_is_complete", "ERROR: Can not push to external DB, will try again", traceback.format_exc()) return None
def generate_resource_config(resource_id, m_resource_name, queuing_system): from akrr.util.sql import cursor_execute log.info("Initiating %s at AKRR" % (m_resource_name, )) if not akrr.dry_run: os.mkdir(os.path.join(resources_dir, m_resource_name), 0o700) file_path = os.path.abspath( os.path.join(resources_dir, m_resource_name, 'resource.conf')) global resource_cfg_filename resource_cfg_filename = file_path create_resource_config(file_path, queuing_system) # add entry to mod_appkernel.resource con_ak, cur_ak = akrr.db.get_ak_db(True) cur_ak.execute('''SELECT * FROM resource WHERE nickname=%s''', (m_resource_name, )) resource_in_ak_db = cur_ak.fetchall() if len(resource_in_ak_db) == 0: cursor_execute( cur_ak, "INSERT INTO resource (resource,nickname,description,enabled,visible,xdmod_resource_id)" "VALUES(%s,%s,%s,0,0,%s);", (m_resource_name, m_resource_name, m_resource_name, resource_id), akrr.dry_run) con_ak.commit() cur_ak.execute('''SELECT * FROM resource WHERE nickname=%s''', (m_resource_name, )) if not akrr.dry_run: resource_in_ak_db = cur_ak.fetchall() resource_id_in_ak_db = resource_in_ak_db[0]['resource_id'] else: resource_id_in_ak_db = 123 # add entry to mod_akrr.resource db, cur = akrr.db.get_akrr_db(True) cur.execute('''SELECT * FROM resources WHERE name=%s''', (m_resource_name, )) resource_in_db = cur.fetchall() if len(resource_in_db) == 0: cursor_execute( cur, '''INSERT INTO resources (id,xdmod_resource_id,name,enabled) VALUES(%s,%s,%s,%s);''', (resource_id_in_ak_db, resource_id, m_resource_name, 0), akrr.dry_run) db.commit() log.info("Resource configuration is in " + file_path)
def update_sub_tasks(self): # force to check SubTasks # stack the subtasks sub_task_info = self.get_sub_task_info() db, cur = akrr.db.get_akrr_db() for subtask_id, subtask_status, subtask_datetime_stamp, subtask_resource, \ subtask_app, subtask_task_param in sub_task_info: cur.execute( '''UPDATE active_tasks SET next_check_time=%s WHERE task_id=%s ;''', (datetime.datetime.today(), subtask_id)) db.commit() cur.close() del db
def app_validate(resource, appkernel, nnodes): from akrr.util.log import verbose resource_name = resource app_name = appkernel error_count = 0 warning_count = 0 log.info("Validating " + app_name + " application kernel installation on " + resource_name) from akrr import get_akrr_dirs akrr_dirs = get_akrr_dirs() default_resource_param_filename = os.path.abspath( os.path.join(akrr_dirs['default_dir'], "default.resource.conf")) resource_param_filename = os.path.abspath( os.path.join(akrr_dirs['cfg_dir'], "resources", resource_name, "resource.conf")) default_app_param_filename = os.path.abspath( os.path.join(akrr_dirs['default_dir'], "default.app.conf")) app_ker_param_filename = os.path.abspath( os.path.join(akrr_dirs['default_dir'], app_name + ".app.conf")) ############################################################################################### # validating resource parameter file log.info("#" * 80) log.info("Validating %s parameters from %s" % (resource_name, resource_param_filename)) if not os.path.isfile(resource_param_filename): log.error("resource parameters file (%s) do not exists!" % (resource_param_filename, )) exit(1) # check syntax try: tmp = {} exec( compile( open(default_resource_param_filename).read(), default_resource_param_filename, 'exec'), tmp) exec( compile( open(resource_param_filename).read(), resource_param_filename, 'exec'), tmp) except Exception: log.exception("Can not load resource from " "" + resource_param_filename + "\n" + "Probably invalid syntax.") exit(1) # check syntax try: tmp = {} exec( compile( open(default_app_param_filename).read(), default_app_param_filename, 'exec'), tmp) exec( compile( open(app_ker_param_filename).read(), app_ker_param_filename, 'exec'), tmp) except Exception: log.exception("Can not load application kernel from " "" + app_ker_param_filename + "\n" + "Probably invalid syntax") exit(1) # now we can load akrr from akrr import cfg from akrr import akrrrestclient from akrr.cli.resource_deploy import make_results_summary from akrr.cfg_util import load_app_default, load_app_on_resource resource = cfg.find_resource_by_name(resource_name) log.info( "Syntax of %s is correct and all necessary parameters are present." % resource_param_filename) cfg.find_app_by_name(app_name) try: app_default = load_app_default(app_name) app = load_app_on_resource(app_name, resource_name, resource, app_default) pprint.pprint(app) except Exception as e: # pylint: disable=broad-except log.exception("Exception occurred during updated app loading:" + str(e)) exit(1) log.info( "Syntax of %s is correct and all necessary parameters are present." % app_ker_param_filename) # check if AK is in DB if True: # add entry to mod_appkernel.resource db_ak, cur_ak = akrr.db.get_ak_db(True) cur_ak.execute( '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, )) ak_in_akdb = cur_ak.fetchall() if len(ak_in_akdb) == 0: cur_ak.execute( '''INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible) VALUES(%s,%s,'node',0,%s,0);''', (app_name, app_name, app_name)) db_ak.commit() cur_ak.execute( '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, )) ak_in_akdb = cur_ak.fetchall()[0] # add entry to mod_akrr.resource db, cur = akrr.db.get_akrr_db(True) cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name, )) ak_in_db = cur.fetchall() if len(ak_in_db) == 0: cur.execute( '''INSERT INTO app_kernels (id,name,enabled,nodes_list) VALUES(%s,%s,0,'1,2,4,8');''', (ak_in_akdb['ak_def_id'], app_name)) db.commit() ############################################################################################### # connect to resource log.info("#" * 80) log.info("Validating resource accessibility. Connecting to %s." % (resource['name'])) if resource['ssh_private_key_file'] is not None and os.path.isfile( resource['ssh_private_key_file']) is False: log.error("Can not access ssh private key (%s)" "" % (resource['ssh_private_key_file'], )) exit(1) str_io = io.StringIO() try: sys.stdout = sys.stderr = str_io # Connect to resource # Spin-up instance before ssh it if resource['batch_scheduler'].lower() == "openstack": # Start instance if it is cloud openstack_server = akrr.util.openstack.OpenStackServer( resource=resource) resource['openstack_server'] = openstack_server openstack_server.create() resource['remote_access_node'] = openstack_server.ip if resource['batch_scheduler'].lower() == "googlecloud": # Start instance if it is cloud googlecloud_server = akrr.util.googlecloud.GoogleCloudServer( resource=resource) resource['googlecloud_server'] = googlecloud_server googlecloud_server.create() resource['remote_access_node'] = googlecloud_server.ip rsh = akrr.util.ssh.ssh_resource(resource) sys.stdout = sys.__stdout__ sys.stderr = sys.__stderr__ except Exception as e: msg2 = str_io.getvalue() msg2 += "\n" + traceback.format_exc() sys.stdout = sys.__stdout__ sys.stderr = sys.__stderr__ msg = "Can not connect to """ + resource['name'] + "\n" + \ "Probably invalid credential, see full error report below\n" + msg2 log.error(msg) raise e print("=" * 80) log.info("Successfully connected to %s\n\n" % (resource['name'])) ############################################################################################### log.info("Checking directory locations\n") d = resource['akrr_data'] log.info("Checking: %s:%s" % (resource['remote_access_node'], d)) status, msg = check_dir(rsh, d, exit_on_fail=True, try_to_create=True) log.info(msg + "\n") d = resource['appkernel_dir'] log.info("Checking: %s:%s" % (resource['remote_access_node'], d)) status, msg = check_dir(rsh, d, exit_on_fail=True, try_to_create=True) log.info(msg + "\n") d = resource['network_scratch'] log.info("Checking: %s:%s" % (resource['remote_access_node'], d)) status, msg = check_dir(rsh, d, exit_on_fail=False, try_to_create=False) if status is True: log.info(msg) else: log.warning(msg) log.warning( ("WARNING %d: network scratch might be have a different location " + "on head node, so if it is by design it is ok") % (warning_count + 1)) warning_count += 1 log.info("") d = resource['local_scratch'] log.info("Checking: %s:%s" % (resource['remote_access_node'], d)) status, msg = check_dir(rsh, d, exit_on_fail=False, try_to_create=False) if status is True: log.info(msg) else: log.warning(msg) log.warning( ("WARNING %d: local scratch might be have a different location " + "on head node, so if it is by design it is ok") % (warning_count + 1)) warning_count += 1 log.info("") # close connection we don't need it any more rsh.close(force=True) del rsh # Delete openstack instance after tests if resource['batch_scheduler'].lower() == "openstack": # delete instance if it is cloud resource['openstack_server'].delete() resource['remote_access_node'] = None if resource['batch_scheduler'].lower() == "googlecloud": # delete instance if it is cloud resource['googlecloud_server'].delete() resource['remote_access_node'] = None ############################################################################################### # send test job to queue log.info("#" * 80) log.info( "Will send test job to queue, wait till it executed and will analyze the output" ) print("Will use AKRR REST API at", akrrrestclient.restapi_host) # get check connection try: r = akrrrestclient.get('/scheduled_tasks') if r.status_code != 200: log.error( "Can not get token for AKRR REST API ( " "" + akrrrestclient.restapi_host + " )\n" + "See server response below:\n %s", json.dumps(r.json(), indent=4)) exit(1) except Exception: log.error("Can not connect to AKRR REST API ( " "" + akrrrestclient.restapi_host + " )\n" + "Is it running?\n" + "See full error report below:\n" + traceback.format_exc()) exit(1) # check if the test job is already submitted task_id = None test_job_lock_filename = os.path.join( cfg.data_dir, resource_name + "_" + app_name + "_test_task.dat") if os.path.isfile(test_job_lock_filename): fin = open(test_job_lock_filename, "r") task_id = int(fin.readline()) fin.close() r = akrrrestclient.get('/tasks/' + str(task_id)) if r.status_code != 200: task_id = None else: log.warning( "\nWARNING %d: Seems this is rerun of this script, will monitor task with task_id = " % (warning_count + 1) + str(task_id)) log.warning("To submit new task delete " + test_job_lock_filename + "\n") warning_count += 1 # check how old is it # submit test job if task_id is None: try: payload = { 'resource': resource_name, 'app': app_name, 'resource_param': "{'nnodes':%d}" % nnodes, 'task_param': "{'test_run':True}" } r = akrrrestclient.post('/scheduled_tasks', data=payload) if r.status_code != 200: log.error( "Can not submit task through AKRR REST API ( " "" + akrrrestclient.restapi_host + " )\n" + "See server response below", json.dumps(r.json(), indent=4)) exit(1) task_id = r.json()['data']['data']['task_id'] except Exception: log.error("Can not submit task through AKRR REST API ( " "" + akrrrestclient.restapi_host + " )\n" + "Is it still running?\n" + "See full error report below:\n" + traceback.format_exc()) exit(1) # write file with tast_id fout = open(os.path.join(test_job_lock_filename), "w") print(task_id, file=fout) fout.close() log.info("\nSubmitted test job to AKRR, task_id is " + str(task_id) + "\n") # now wait till job is done msg_body0 = "" while True: t = datetime.datetime.now() # try: r = akrrrestclient.get('/tasks/' + str(task_id)) if r.status_code == 200: response_json = r.json() msg_body = "=" * 80 msg_body += "\nTast status:\n" if response_json["data"]["queue"] == "scheduled_tasks": msg_body += "Task is in scheduled_tasks queue.\n" msg_body += "It schedule to be started on " + response_json[ "data"]["data"]['time_to_start'] + "\n" elif response_json["data"]["queue"] == "active_tasks": msg_body += "Task is in active_tasks queue.\n" msg_body += "Status: " + str( response_json["data"]["data"]['status']) + "\n" msg_body += "Status info:\n" + str( response_json["data"]["data"]['status_info']) + "\n" elif response_json["data"]["queue"] == "completed_tasks": msg_body += "Task is completed!\n" completed_tasks = r.json()['data']['data']['completed_tasks'] akrr_xdmod_instanceinfo = r.json( )['data']['data']['akrr_xdmod_instanceinfo'] if verbose: msg_body += "completed_tasks table entry:\n" + pp.pformat( completed_tasks) + "\n" msg_body += "akrr_xdmod_instanceinfo table entry:\n" + pp.pformat( akrr_xdmod_instanceinfo) + "\n" msg_body += 'output parsing results:\n' + akrr_xdmod_instanceinfo[ 'body'] + "\n" else: msg_body += "\tstatus: " + str( akrr_xdmod_instanceinfo['status']) + "\n" if akrr_xdmod_instanceinfo['status'] == 0: msg_body += "\tstatus2: " + completed_tasks[ 'status'] + "\n" msg_body += "\tstatus_info: " + completed_tasks[ 'status_info'] + "\n" else: msg_body += r.text + "\n" tail_msg = "time: " + t.strftime("%Y-%m-%d %H:%M:%S") if msg_body != msg_body0: print("\n\n" + msg_body) print(tail_msg, end=' ') sys.stdout.flush() else: print("\r" + tail_msg, end=' ') sys.stdout.flush() msg_body0 = copy.deepcopy(msg_body) if response_json["data"]["queue"] == "completed_tasks": break # try to update: try: payload = {'next_check_time': ''} akrrrestclient.put('/active_tasks/' + str(task_id), data=payload) except Exception: pass time.sleep(5) ############################################################################################### # analysing the output log.info("Test job is completed analyzing output\n") r = akrrrestclient.get('/tasks/' + str(task_id)) if r.status_code != 200: log.error( "Can not get information about task\n" + "See full error report below", "AKRR server response:\n" + r.text) exit(1) completed_tasks = r.json()['data']['data']['completed_tasks'] akrr_xdmod_instanceinfo = r.json( )['data']['data']['akrr_xdmod_instanceinfo'] akrr_errmsg = r.json()['data']['data']['akrr_errmsg'] results_summary = make_results_summary(resource_name, app_name, completed_tasks, akrr_xdmod_instanceinfo, akrr_errmsg) # execution was not successful if completed_tasks['status'].count("ERROR") > 0: if completed_tasks['status'].count( "ERROR Can not created batch job script and submit it to remote queue" ) > 0: log.error( "Can not created batch job script and/or submit it to remote queue\n" + "See full error report below:\n" + results_summary) os.remove(test_job_lock_filename) exit(1) else: log.error(completed_tasks['status'] + "\n" + "See full error report below:\n" + results_summary) os.remove(test_job_lock_filename) exit(1) # execution was not successful if akrr_xdmod_instanceinfo['status'] == 0: log.error("Task execution was not successful\n" + "See full error report below:\n" + results_summary) os.remove(test_job_lock_filename) exit(1) # see what is in report elm_perf = XMLElementTree.fromstring(akrr_xdmod_instanceinfo['body']) elm_perf.find('benchmark').find('parameters') elm_perf.find('benchmark').find('statistics') log.info("\nTest kernel execution summary:") print(results_summary) print() # log.info("\nThe output looks good.\n") if error_count == 0: # enabling resource for execution log.info("\nEnabling %s on %s for execution\n" % (app_name, resource_name)) try: result = akrrrestclient.put('/resources/%s/on' % (resource_name, ), data={'application': app_name}) if result.status_code == 200: log.info("Successfully enabled %s on %s" % (app_name, resource_name)) else: if result is not None: log.error( "Can not turn-on %s on %s" % (app_name, resource_name), result.text) else: log.error("Can not turn-on %s on %s" % (app_name, resource_name)) exit(1) if True: # add entry to mod_appkernel.resource db_ak, cur_ak = akrr.db.get_ak_db(True) cur_ak.execute( '''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, )) ak_in_akdb = cur_ak.fetchall() if len(ak_in_akdb) == 0: cur_ak.execute( "INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible)" "VALUES(%s,%s,'node',0,%s,0);", (app_name, app_name, app_name)) db_ak.commit() cur_ak.execute( '''UPDATE app_kernel_def SET enabled=1,visible=1 WHERE ak_base_name=%s''', (app_name, )) db_ak.commit() # add entry to mod_akrr.resource db, cur = akrr.db.get_akrr_db(True) cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name, )) ak_in_db = cur.fetchall() if len(ak_in_db) == 0: cur.execute( '''INSERT INTO app_kernels (id,name,enabled,nodes_list) VALUES(%s,%s,0,'1,2,4,8');''', (ak_in_akdb['ak_def_id'], app_name)) db.commit() cur.execute( '''UPDATE app_kernels SET enabled=1 WHERE name=%s''', (app_name, )) db.commit() except Exception: log.exception("Can not turn-on %s on %s", app_name, resource_name) exit(1) if error_count > 0: log.error("There are %d errors, fix them.", error_count) if warning_count > 0: log.warning( "\nThere are %d warnings.\nif warnings have sense (highlighted in yellow), you can move to next step!\n" % warning_count) if error_count == 0 and warning_count == 0: log.info("\nDONE, you can move to next step!\n") os.remove(test_job_lock_filename)