def update_bashrc(self):
    """
    Add AKRR environment variables to the user's ``~/.bashrc``.

    A block delimited by ``#AKRR Server Environment Variables [Start]`` and
    ``... [End]`` marker lines is built. It exports ``PATH`` for in-source
    installs and ``AKRR_HOME`` when the AKRR home type is ``in_env_path``.
    If neither export is needed, nothing is written.

    When ``~/.bashrc`` already exists it is first copied to
    ``~/.bashrc.akrr_back``. A previously written AKRR block is replaced in
    place; otherwise the new block is appended at the end. In dry-run mode
    (``akrr.dry_run``) no file is touched and the would-be content is only
    logged at debug level.

    Raises
    ------
    OSError
        if the backup copy or the rewrite of ``~/.bashrc`` fails. The backup
        is made before the rewrite, so a failed backup leaves the original
        file untouched.
    """
    # Local import: only this method needs it.
    import shutil

    log.info("Updating .bashrc")

    akrr_header = '#AKRR Server Environment Variables'
    start_marker = akrr_header + " [Start]"
    end_marker = akrr_header + " [End]"

    akrr_bash_content_new = ["\n" + start_marker + "\n"]
    if _in_src_install:
        akrr_bash_content_new.append(
            "export PATH=\"{0}:$PATH\"\n".format(_akrr_bin_dir))
    akrr_home_type = akrr.get_akrr_dirs(self.akrr_home_dir)['akrr_home_type']
    if akrr_home_type == akrr.AKRRHomeType.in_env_path:
        # i.e. non standard AKRR home location
        akrr_bash_content_new.append(
            "export AKRR_HOME=\"{0}\"\n".format(_akrr_home))
    akrr_bash_content_new.append(end_marker + "\n\n")

    # Only the two marker lines present means there is nothing to export.
    if len(akrr_bash_content_new) <= 2:
        log.info(
            "AKRR is in standard location, no updates to $HOME/.bashrc")
        return

    bashrc_path = os.path.expanduser("~/.bashrc")

    if os.path.exists(bashrc_path):
        log.info(
            "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc.akrr_back"
        )
        if not akrr.dry_run:
            # shutil.copy raises on failure, so .bashrc is never rewritten
            # without a backup (a shell `cp` exit status was ignored before).
            shutil.copy(bashrc_path, bashrc_path + ".akrr_back")

        with open(bashrc_path, 'r') as f:
            old_lines = f.readlines()

        bash_content_new = []
        in_akrr = False
        akrr_added = False
        for line in old_lines:
            if start_marker in line:
                in_akrr = True
                # Put the new block where the first old block started.
                if not akrr_added:
                    bash_content_new += akrr_bash_content_new
                    akrr_added = True
            if not in_akrr:
                bash_content_new.append(line)
            if end_marker in line:
                in_akrr = False
        if not akrr_added:
            bash_content_new += akrr_bash_content_new
    else:
        bash_content_new = akrr_bash_content_new

    if akrr.dry_run:
        log.debug("New .bashrc should be like:\n" + "".join(bash_content_new))
        return

    with open(bashrc_path, 'w') as f:
        f.writelines(bash_content_new)
    log.info("Appended AKRR records to $HOME/.bashrc")
def run(self,
        akrr_db: str = None,
        ak_db: str = None,
        xd_db: str = None,
        install_cron_scripts: bool = True,
        stand_alone: bool = False,
        akrr_home: str = None,
        generate_db_only: bool = False,
        update: bool = False,
        old_akrr_home: str = None,
        skip_update_completed_dirs=False,
        skip_update_db=False,
        skip_saving_db_for_update=False):
    """
    Setup or update AKRR

    Parameters
    ----------
    akrr_db: connection string for the AKRR database. If None,
        ``self.default_akrr_db`` is used for a fresh setup and the values
        from the old configuration are used for an update.
    ak_db: connection string for the app kernel database. If None, akrr_db
        is used for a fresh setup and the old configuration for an update.
    xd_db: connection string for the XDMoD database. If None, akrr_db is
        used for a fresh setup and the old configuration for an update.
    install_cron_scripts: install cron scripts
    stand_alone: run without XDMoD
    akrr_home: custom location of akrr home
    generate_db_only: only generate DB, stop before touching .bashrc,
        the daemon and cron
    update: perform update from previous stable version
    old_akrr_home: location of old AKRR home for update
    skip_update_completed_dirs: on update, do not copy old completed tasks
        directories
    skip_update_db: on update, do not update the database
    skip_saving_db_for_update: passed through to the database update
    """
    # The module-level directory layout is rebound below for the chosen home.
    global _akrr_dirs, _akrr_home, _akrr_cfg

    hints_to_finish_update = ""

    if update:
        self.update = akrr.update.UpdateAKRR(old_akrr_home)

    # Set initial db conf
    if not update:
        if akrr_db is None:
            akrr_db = self.default_akrr_db
        # if ak_db and xd_db is not set use akrr_db
        if ak_db is None:
            ak_db = akrr_db
        if xd_db is None:
            xd_db = akrr_db
    else:
        old_cfg = self.update.old_cfg
        if akrr_db is None:
            # i.e. not set, use the values of the old installation
            akrr_db = set_user_password_host_port_db(
                old_cfg['akrr_db_user'], old_cfg['akrr_db_passwd'],
                old_cfg['akrr_db_host'], old_cfg['akrr_db_port'],
                old_cfg['akrr_db_name'])
        if ak_db is None:
            ak_db = set_user_password_host_port_db(
                old_cfg['ak_db_user'], old_cfg['ak_db_passwd'],
                old_cfg['ak_db_host'], old_cfg['ak_db_port'],
                old_cfg['ak_db_name'])
        if xd_db is None:
            xd_db = set_user_password_host_port_db(
                old_cfg['xd_db_user'], old_cfg['xd_db_passwd'],
                old_cfg['xd_db_host'], old_cfg['xd_db_port'],
                old_cfg['xd_db_name'])

    # Get db details
    self.akrr_db_user_name, self.akrr_db_user_password, self.akrr_db_host, self.akrr_db_port, self.akrr_db_name = \
        get_user_password_host_port_db(akrr_db, default_database="mod_akrr")
    self.ak_db_user_name, self.ak_db_user_password, self.ak_db_host, self.ak_db_port, self.ak_db_name = \
        get_user_password_host_port_db(ak_db, default_database="mod_appkernel")
    self.xd_db_user_name, self.xd_db_user_password, self.xd_db_host, self.xd_db_port, self.xd_db_name = \
        get_user_password_host_port_db(xd_db, default_database="modw")

    self.stand_alone = stand_alone
    self.generate_db_only = generate_db_only
    self.install_cron_scripts_flag = install_cron_scripts
    self.akrr_home_dir = akrr_home

    # check
    self.check_utils()

    # get directories layout
    self._initial_akrr_dirs = _akrr_dirs
    self._akrr_dirs = akrr.get_akrr_dirs(self.akrr_home_dir)
    _akrr_dirs = self._akrr_dirs
    _akrr_home = _akrr_dirs["akrr_home"]
    _akrr_cfg = _akrr_dirs["akrr_cfg"]

    if self.update:
        # require that old and new akrr home was different; compare the home
        # path itself (the whole layout dict never equals a path)
        if _akrr_home == self.update.old_akrr_home:
            log.error(
                "Old and new akrr home directories should be different. Rename old akrr home.\n" +
                "\tOld AKRR home: %s\n\tNew AKRR home: %s",
                self.update.old_akrr_home, _akrr_home)
            exit(1)
        # shut down old daemon, remove it from cron and update DB
        self.update.remove_old_akrr_from_crontab()
        self.update.shut_down_old_akrr()

    if not self.update:
        # check previous installation
        self.check_previous_installation()

    # set installation directory
    self.init_dir()

    if not self.update:
        # ask info
        self.read_db_user_credentials()
        if self.install_cron_scripts_flag and not self.generate_db_only:
            self.ask_cron_email()

    self.init_mysql_dbs()
    self.generate_self_signed_certificate()
    cfg = self.generate_settings_file()

    if self.update:
        # copy old logs
        if not skip_update_completed_dirs:
            akrr.update.UpdateCompletedDirs(
                self.update.old_cfg["completed_tasks_dir"],
                cfg["completed_tasks_dir"]).run()
        # update DB
        if not skip_update_db:
            akrr.update.UpdateDataBase(self.update).update(
                skip_saving_db_for_update=skip_saving_db_for_update)
        # update config files for resources and appkernels
        hints_to_finish_update = akrr.update.UpdateResourceAppConfigs(
            self.update).update()

    self.set_permission_on_files()
    self.db_check()

    if not self.update:
        self.generate_tables()

    if self.generate_db_only:
        log.info("AKRR DB Generated")
        return

    self.update_bashrc()
    self.start_daemon()
    self.check_daemon()
    if self.install_cron_scripts_flag:
        self.install_cron_scripts()

    log.info("AKRR is set up and is running.")
    if self.update:
        log.warning(
            "Below are instructions to finish conversion " +
            "(shell commands, execute them manually one by one ensure correct run):\n" +
            hints_to_finish_update)
""" AKRR configuration """ import os import re from akrr import get_akrr_dirs from akrr.util import log, clear_from_build_in_var # load default values from .cfg_default import * # pylint: disable=wildcard-import,unused-wildcard-import # get directories locations for this installation akrr_dirs = get_akrr_dirs() in_src_install = akrr_dirs['in_src_install'] akrr_mod_dir = akrr_dirs['akrr_mod_dir'] akrr_bin_dir = akrr_dirs['akrr_bin_dir'] akrr_cli_fullpath = akrr_dirs['akrr_cli_fullpath'] akrr_cfg = akrr_dirs['akrr_cfg'] akrr_home = akrr_dirs['akrr_home'] cfg_dir = akrr_dirs['cfg_dir'] templates_dir = akrr_dirs['templates_dir'] default_dir = akrr_dirs['default_dir'] appker_repo_dir = akrr_dirs['appker_repo_dir'] from akrr.cfg_util import load_resource, load_app # Resource configurations are stored here resources = {}
sys.version) exit(1) # check openssl presence try: subprocess.check_output("which openssl", shell=True) except Exception as _e: log.error("""openssl program is not available. Install it! For example by running on CentOS sudo yum install openssl openssh-clients on Ubuntu: sudo apt-get install openssl""") raise _e _akrr_dirs = akrr.get_akrr_dirs() _akrr_mod_dir = _akrr_dirs['akrr_mod_dir'] _akrr_bin_dir = _akrr_dirs['akrr_bin_dir'] _in_src_install = _akrr_dirs['in_src_install'] # _akrr_home and _akrr_cfg might be later change during setup # In the beginning it will indicate previous installation _akrr_home: Optional[str] = _akrr_dirs["akrr_home"] _akrr_cfg: Optional[str] = _akrr_dirs["akrr_cfg"] def _cursor_execute(cur, query, args=None): from akrr.util.sql import cursor_execute cursor_execute(cur, query, args=args, dry_run=akrr.dry_run) def _read_username_password(prompt="Enter username:",
def _exec_config_files(*filenames):
    """Execute configuration files, in order, in one shared namespace and return it."""
    # NOTE(security): configuration files are executed as Python code, so they
    # must come from a trusted location.
    namespace = {}
    for filename in filenames:
        with open(filename) as fin:
            exec(compile(fin.read(), filename, 'exec'), namespace)
    return namespace


def _ensure_app_kernel_in_db(cfg, app_name, enable=False):
    """Make sure app_name has a row in both databases; with enable=True also switch it on."""
    # entry in the app kernel database (app_kernel_def)
    db_ak, cur_ak = cfg.getAKDB(True)
    cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name,))
    if not cur_ak.fetchall():
        cur_ak.execute(
            '''INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible) VALUES(%s,%s,'node',0,%s,0);''',
            (app_name, app_name, app_name))
        db_ak.commit()
    if enable:
        cur_ak.execute('''UPDATE app_kernel_def SET enabled=1,visible=1 WHERE ak_base_name=%s''', (app_name,))
        db_ak.commit()

    # Always re-read the row: its ak_def_id is needed below, whether the row
    # was just inserted or already existed.
    cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name,))
    ak_def = cur_ak.fetchall()[0]

    # entry in the AKRR database (app_kernels)
    db, cur = cfg.getDB(True)
    cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name,))
    if not cur.fetchall():
        cur.execute(
            '''INSERT INTO app_kernels (id,name,enabled,nodes_list) VALUES(%s,%s,0,'1,2,4,8');''',
            (ak_def['ak_def_id'], app_name))
        db.commit()
    if enable:
        cur.execute('''UPDATE app_kernels SET enabled=1 WHERE name=%s''', (app_name,))
        db.commit()


def app_validate(resource, appkernel, nnodes, verbose=False):
    """
    Validate an application kernel installation on a resource and enable it.

    Steps, in order:

    1. check that the resource and application kernel configuration files
       can be executed without error;
    2. make sure the application kernel is registered in the databases;
    3. connect to the resource over ssh and check its directories;
    4. submit a test task through the AKRR REST API (or pick up the task
       recorded in the lock file of a previous run) and poll until it is
       completed;
    5. analyse the result and, on success, enable the application kernel on
       the resource.

    Parameters
    ----------
    resource: name of the resource
    appkernel: name of the application kernel
    nnodes: number of nodes for the test task (formatted with ``%d``)
    verbose: print full table entries of the completed task

    Any failed step is logged and terminates the process through
    ``exit(1)``.
    """
    # Kept from the original: publish verbosity as a module global
    # (presumably read by helpers elsewhere in this module - confirm).
    globals()['verbose'] = verbose

    resource_name = resource
    app_name = appkernel
    error_count = 0
    warning_count = 0

    log.info("Validating " + app_name + " application kernel installation on " + resource_name)

    from akrr import get_akrr_dirs

    akrr_dirs = get_akrr_dirs()

    default_resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.resource.conf"))
    resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['cfg_dir'], "resources", resource_name, "resource.conf"))
    default_app_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.app.conf"))
    app_ker_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], app_name + ".app.conf"))

    ###########################################################################
    # validating resource parameter file
    log.info("#" * 80)
    log.info("Validating %s parameters from %s" % (resource_name, resource_param_filename))

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) do not exists!" % (resource_param_filename,))
        exit(1)

    # check syntax of the resource configuration
    try:
        _exec_config_files(default_resource_param_filename, resource_param_filename)
    except Exception:
        log.exception("Can not load resource from " + resource_param_filename + "\n" +
                      "Probably invalid syntax.")
        exit(1)

    # check syntax of the application kernel configuration
    try:
        _exec_config_files(default_app_param_filename, app_ker_param_filename)
    except Exception:
        log.exception("Can not load application kernel from " + app_ker_param_filename + "\n" +
                      "Probably invalid syntax")
        exit(1)

    # now we can load akrr
    from . import cfg
    from . import akrrrestclient
    from .resource_deploy import makeResultsSummary

    resource = cfg.FindResourceByName(resource_name)
    log.info("Syntax of %s is correct and all necessary parameters are present." % resource_param_filename,
             highlight="ok")

    app = cfg.FindAppByName(app_name)
    log.info("Syntax of %s is correct and all necessary parameters are present." % app_ker_param_filename,
             highlight="ok")

    # check if AK is in DB, register it (disabled) if it is not
    _ensure_app_kernel_in_db(cfg, app_name, enable=False)

    ###########################################################################
    # connect to resource
    log.info("#" * 80)
    log.info("Validating resource accessibility. Connecting to %s." % (resource['name']))
    if resource['sshPrivateKeyFile'] is not None and not os.path.isfile(resource['sshPrivateKeyFile']):
        log.error("Can not access ssh private key (%s)" % (resource['sshPrivateKeyFile'],))
        exit(1)

    # Capture everything the ssh helper prints, so it can be attached to the
    # error report; the streams are restored even if the connection fails.
    captured = io.StringIO()
    saved_stdout, saved_stderr = sys.stdout, sys.stderr
    try:
        sys.stdout = sys.stderr = captured
        try:
            rsh = cfg.sshResource(resource)
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
    except Exception:
        log.error("Can not connect to " + resource['name'] + "\n" +
                  "Probably invalid credential, see full error report below",
                  captured.getvalue() + "\n" + traceback.format_exc())
        exit(1)
    print("=" * 80)
    log.info("Successfully connected to %s\n\n" % (resource['name']), highlight="ok")

    ###########################################################################
    log.info("Checking directory locations\n")

    # directories which must exist (they are created when missing)
    for key in ('akrrData', 'appKerDir'):
        d = resource[key]
        log.info("Checking: %s:%s" % (resource['remoteAccessNode'], d))
        status, msg = CheckDir(rsh, d, exitOnFail=True, tryToCreate=True)
        log.info(msg + "\n", highlight="ok")

    # scratch directories: a failure here is only a warning
    for key, label in (('networkScratch', 'network scratch'), ('localScratch', 'local scratch')):
        d = resource[key]
        log.info("Checking: %s:%s" % (resource['remoteAccessNode'], d))
        status, msg = CheckDir(rsh, d, exitOnFail=False, tryToCreate=False)
        if status:
            log.info(msg, highlight="ok")
        else:
            log.info(msg, highlight="warning")
            log.info(
                "WARNING %d: %s might be have a different location on head node, so if it is by design it is ok"
                % (warning_count + 1, label), highlight="warning")
            warning_count += 1
        log.info("")

    # close connection we don't need it any more
    rsh.close(force=True)
    del rsh

    ###########################################################################
    # send test job to queue
    log.info("#" * 80)
    log.info("Will send test job to queue, wait till it executed and will analyze the output")

    print("Will use AKRR REST API at", akrrrestclient.restapi_host)
    # check connection
    try:
        r = akrrrestclient.get('/scheduled_tasks')
        if r.status_code != 200:
            log.error("Can not get token for AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                      "See server response below", json.dumps(r.json(), indent=4))
            exit(1)
    except Exception:
        log.error("Can not connect to AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                  "Is it running?\n" +
                  "See full error report below", traceback.format_exc())
        exit(1)

    # check if the test job is already submitted
    task_id = None
    test_job_lock_filename = os.path.join(cfg.data_dir, resource_name + "_" + app_name + "_test_task.dat")
    if os.path.isfile(test_job_lock_filename):
        with open(test_job_lock_filename, "r") as fin:
            first_line = fin.readline()
        try:
            task_id = int(first_line)
        except ValueError:
            # unreadable lock file: behave as if no task was submitted
            task_id = None

        if task_id is not None:
            r = akrrrestclient.get('/tasks/' + str(task_id))
            if r.status_code != 200:
                task_id = None
            else:
                log.info(
                    "\nWARNING %d: Seems this is rerun of this script, will monitor task with task_id = "
                    % (warning_count + 1) + str(task_id), highlight="warning")
                log.info("To submit new task delete " + test_job_lock_filename + "\n", highlight="warning")
                warning_count += 1
            # TODO: check how old is it

    # submit test job
    if task_id is None:
        try:
            payload = {'resource': resource_name,
                       'app': app_name,
                       'resource_param': "{'nnodes':%d}" % nnodes,
                       'task_param': "{'test_run':True}"
                       }
            r = akrrrestclient.post('/scheduled_tasks', data=payload)
            if r.status_code != 200:
                log.error("Can not submit task through AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                          "See server response below", json.dumps(r.json(), indent=4))
                exit(1)
            task_id = r.json()['data']['task_id']
        except Exception:
            log.error("Can not submit task through AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                      "Is it still running?\n" +
                      "See full error report below", traceback.format_exc())
            exit(1)
        # write file with task_id, so a rerun monitors the same task
        with open(test_job_lock_filename, "w") as fout:
            print(task_id, file=fout)
        log.info("\nSubmitted test job to AKRR, task_id is " + str(task_id) + "\n")

    # now wait till job is done
    msg_body_prev = ""
    while True:
        t = datetime.datetime.now()
        r = akrrrestclient.get('/tasks/' + str(task_id))

        task_completed = False
        if r.status_code == 200:
            task = r.json()["data"]
            queue = task["queue"]

            msg_body = "=" * 80
            msg_body += "\nTast status:\n"

            if queue == "scheduled_tasks":
                msg_body += "Task is in scheduled_tasks queue.\n"
                msg_body += "It schedule to be started on" + task["data"]['time_to_start'] + "\n"
            elif queue == "active_tasks":
                msg_body += "Task is in active_tasks queue.\n"
                msg_body += "Status: " + str(task["data"]['status']) + "\n"
                msg_body += "Status info:\n" + str(task["data"]['statusinfo']) + "\n"
            elif queue == "completed_tasks":
                task_completed = True
                msg_body += "Task is completed!\n"
                completed_tasks = task['data']['completed_tasks']
                akrr_xdmod_instanceinfo = task['data']['akrr_xdmod_instanceinfo']
                if verbose:
                    msg_body += "completed_tasks table entry:\n" + pp.pformat(completed_tasks) + "\n"
                    msg_body += "akrr_xdmod_instanceinfo table entry:\n" + \
                                pp.pformat(akrr_xdmod_instanceinfo) + "\n"
                    msg_body += 'output parsing results:\n' + akrr_xdmod_instanceinfo['body'] + "\n"
                else:
                    msg_body += "\tstatus: " + str(akrr_xdmod_instanceinfo['status']) + "\n"
                    if akrr_xdmod_instanceinfo['status'] == 0:
                        msg_body += "\tstatus2: " + completed_tasks['status'] + "\n"
                    msg_body += "\tstatusinfo: " + completed_tasks['statusinfo'] + "\n"
        else:
            # The server did not return the task: show its answer and keep
            # polling instead of reading fields of an error response.
            msg_body = r.text + "\n"

        tail_msg = "time: " + t.strftime("%Y-%m-%d %H:%M:%S")

        if msg_body != msg_body_prev:
            print("\n\n" + msg_body)
            print(tail_msg, end=' ')
        else:
            # nothing new: only refresh the time stamp on the same line
            print("\r" + tail_msg, end=' ')
        sys.stdout.flush()

        msg_body_prev = msg_body

        if task_completed:
            break

        # ask the server to check the task right away; best effort only
        try:
            payload = {'next_check_time': ''}
            r = akrrrestclient.put('/active_tasks/' + str(task_id), data=payload)
        except Exception:
            pass
        time.sleep(5)

    ###########################################################################
    # analysing the output
    log.info("\n\nTest job is completed analyzing output\n", highlight="ok")
    r = akrrrestclient.get('/tasks/' + str(task_id))
    if r.status_code != 200:
        log.error("Can not get information about task\n" +
                  "See full error report below",
                  "AKRR server response:\n" + r.text)
        exit(1)
    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instanceinfo = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data['akrr_errmsg']

    results_summary = makeResultsSummary(verbose, resource_name, app_name, completed_tasks,
                                         akrr_xdmod_instanceinfo, akrr_errmsg)
    # execution was not successful
    if completed_tasks['status'].count("ERROR") > 0:
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue") > 0:
            log.error("Can not created batch job script and/or submit it to remote queue\n" +
                      "See full error report below",
                      results_summary)
        else:
            log.error(completed_tasks['status'] + "\n" +
                      "See full error report below",
                      results_summary)
        # the task is analysed: a rerun should submit a new one
        os.remove(test_job_lock_filename)
        exit(1)

    # execution was not successful
    if akrr_xdmod_instanceinfo['status'] == 0:
        log.error("Task execution was not successful\n" +
                  "See full error report below",
                  results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report; the elements are not used further, but parsing
    # them fails loudly on a malformed report
    elm_perf = ET.fromstring(akrr_xdmod_instanceinfo['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')

    log.info("\nTest kernel execution summary:", highlight="ok")
    print(results_summary)
    print()

    if error_count == 0:
        # enabling resource for execution
        log.info("\nEnabling %s on %s for execution\n" % (app_name, resource_name), highlight="ok")
        try:
            result = akrrrestclient.put(
                '/resources/%s/on' % (resource_name,),
                data={'application': app_name})
            if result.status_code == 200:
                log.info("Successfully enabled %s on %s" % (app_name, resource_name))
            else:
                log.error("Can not turn-on %s on %s" % (app_name, resource_name), result.text)
                exit(1)

            # make sure the kernel is registered and switch it on in the DBs
            _ensure_app_kernel_in_db(cfg, app_name, enable=True)
        except Exception:
            log.exception("Can not turn-on %s on %s", app_name, resource_name)
            exit(1)

    if error_count > 0:
        log.error("There are %d errors, fix them.", error_count)
    if warning_count > 0:
        log.info(
            "\nThere are %d warnings.\nif warnings have sense (highlighted in yellow), you can move to next step!\n"
            % warning_count, highlight="warning")
    if error_count == 0 and warning_count == 0:
        log.info("\nDONE, you can move to next step!\n", highlight="ok")
    os.remove(test_job_lock_filename)