def append_to_bashrc(resource):
    """Write the AKRR environment variable section into the remote ``$HOME/.bashrc``.

    Two shell snippets are sent through ``cfg.sshCommand``:

    1. If ``$HOME/.bashrc`` already holds a section delimited by the
       ``#<header> [Start]`` / ``#<header> [End]`` marker lines, the file is
       copied to ``$HOME/.bashrc_akrrbak`` and rewritten without that section.
    2. A fresh section exporting ``AKRR_NETWORK_SCRATCH``,
       ``AKRR_LOCAL_SCRATCH``, ``AKRR_APPKER_DIR`` and ``AKRR_AKRR_DIR`` is
       appended.

    Nothing is done when the module-level ``dry_run`` flag is set.

    Args:
        resource: resource configuration mapping; the keys ``name``,
            ``networkScratch``, ``localScratch``, ``appKerDir`` and
            ``akrrData`` are read here.

    Raises:
        Exception: whatever the SSH helpers raised; it is re-raised after the
            captured output has been written to the critical log.
    """
    log.info("\nAdding AKRR enviroment variables to resource's .bashrc!\n")
    if dry_run:
        return

    akrr_header = 'AKRR Remote Resource Environment Variables'

    # Everything printed while the session is open is collected here so that
    # it can be attached to the error report below.
    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        try:
            rsh = cfg.sshResource(resource)
            try:
                # Step 1: cut a previously written AKRR section out of .bashrc.
                # Backslashes are doubled so that the shell receives \# \[ \]
                # without relying on Python's invalid-escape fallback.
                out = cfg.sshCommand(
                    rsh,
                    '''if [ -e $HOME/.bashrc ]
then
   if [[ `grep "\\#''' + akrr_header + ''' \\[Start\\]" $HOME/.bashrc` == *"''' + akrr_header + ''' [Start]"* ]]
   then
       echo "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak"
       cp $HOME/.bashrc $HOME/.bashrc_akrrbak
       head -n "$(( $(grep -n '\\#''' + akrr_header + ''' \\[Start\\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) - 1 ))" $HOME/.bashrc_akrrbak > $HOME/.bashrc
       tail -n "+$(( $(grep -n '\\#''' + akrr_header + ''' \\[End\\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) + 1 ))" $HOME/.bashrc_akrrbak >> $HOME/.bashrc
   fi
fi''')
                log.debug2(out)

                # Step 2: append the current values between the two markers.
                out = cfg.sshCommand(
                    rsh,
                    '''
echo "Appending AKRR records to $HOME/.bashrc"
echo "#''' + akrr_header + ''' [Start]" >> $HOME/.bashrc
echo "export AKRR_NETWORK_SCRATCH=\\"''' + resource['networkScratch'] + '''\\"" >> $HOME/.bashrc
echo "export AKRR_LOCAL_SCRATCH=\\"''' + resource['localScratch'] + '''\\"" >> $HOME/.bashrc
echo "export AKRR_APPKER_DIR=\\"''' + resource['appKerDir'] + '''\\"" >> $HOME/.bashrc
echo "export AKRR_AKRR_DIR=\\"''' + resource['akrrData'] + '''\\"" >> $HOME/.bashrc
echo "#''' + akrr_header + ''' [End]" >> $HOME/.bashrc
''')
                log.debug2(out)
            finally:
                # close the session on failure as well as on success
                rsh.close(force=True)
        finally:
            # always hand the real streams back, even on KeyboardInterrupt
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
    except Exception:
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise
def check_shell(rsh, resource):
    """Stop the program unless the shell reached through ``rsh`` is BASH.

    ``echo $BASH`` is run with ``cfg.sshCommand``; when the reply does not
    contain "bash" an error naming ``resource['name']`` is logged and the
    process exits with status 1.
    """
    log.info("Checking if shell is BASH\n")
    reply = cfg.sshCommand(rsh, "echo $BASH")

    if "bash" not in reply:
        log.error(
            "Shell on headnode of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        exit(1)

    log.info("Shell is BASH\n")
def check_dir_simple(sh, d):
    """Check that directory ``d`` exists on the remote side and is writable.

    The check runs through ``cfg.sshCommand`` on session ``sh``: first a
    ``[ -d ... ]`` test, then a small probe file ``akrr_test_write`` is
    written into the directory, read back and removed again.

    Args:
        sh: open remote shell session; ``sh.remotemachine`` is used in the
            returned messages.
        d: directory path to examine.

    Returns:
        tuple: ``(status, message)`` where status is

        * ``None``  - the directory does not exist,
        * ``True``  - the probe file could be written and read back,
        * ``False`` - the probe file could not be read back.
    """
    cmd = "if [ -d \"%s\" ]\n then \n echo EXIST\n else echo DOES_NOT_EXIST\n fi" % (
        d, )
    msg = cfg.sshCommand(sh, cmd)
    if msg.find("DOES_NOT_EXIST") >= 0:
        return None, "Directory %s:%s does not exists!" % (sh.remotemachine, d)

    # Quote the probe path the same way the existence test above quotes the
    # directory, so a path with spaces is handled consistently by every step.
    test_file = '"%s"' % os.path.join(d, 'akrr_test_write')

    cfg.sshCommand(sh, "echo test > " + test_file)
    msg = cfg.sshCommand(sh, "cat " + test_file)

    if msg.strip() == "test":
        # the probe file is only cleaned up when it was written successfully
        cfg.sshCommand(sh, "rm " + test_file)
        return True, "Directory exist and accessible for read/write"

    return False, "Directory %s:%s is NOT accessible for read/write!" % (
        sh.remotemachine, d)
def check_dir(sh, d, exit_on_fail=True, try_to_create=True):
    """Check remote directory ``d`` and optionally create it or abort on failure.

    The directory is examined with ``check_dir_simple``.  If it is missing and
    ``try_to_create`` is ``True`` it is created with ``mkdir -p`` and examined
    again; in ``dry_run`` mode nothing is created and the directory is
    reported as accessible.

    Args:
        sh: open remote shell session.
        d: directory path to examine.
        exit_on_fail: when ``False`` the result is returned to the caller
            whatever it is; otherwise a missing or non-writable directory
            terminates the process.
        try_to_create: when ``True`` a missing directory is created.

    Returns:
        tuple: ``(status, message)`` as produced by ``check_dir_simple``.
    """
    status, msg = check_dir_simple(sh, d)

    if try_to_create is True and status is None:
        log.info("Directory %s:%s does not exists, will try to create it",
                 sh.remotemachine, d)
        if not dry_run:
            cmd = "mkdir -p \"%s\"" % (d, )
            cfg.sshCommand(sh, cmd)
            status, msg = check_dir_simple(sh, d)
        else:
            status, msg = (True, "Directory exist and accessible for read/write")

    if exit_on_fail is False:
        return status, msg

    if status is None:
        log.error("Directory %s:%s does not exists!", sh.remotemachine, d)
        # a bare exit() would end the process with status 0, i.e. "success"
        exit(1)
    elif status is True:
        return True, msg
    else:
        log.error("Directory %s:%s is NOT accessible for read/write!",
                  sh.remotemachine, d)
        exit(1)
def check_appsig(rsh, resource):
    """Run the application signature calculator through ``rsh`` and verify its output.

    ``<appKerDir>/execs/bin/appsigcheck.sh`` is executed on ``md5sum``.  The
    check passes when the output contains both "===ExeBinSignature===" and
    "MD5:".  Otherwise the failure is only noted in ``dry_run`` mode, while a
    real run logs the output and exits with status 1.
    """
    log.info("Testing app.signature calculator on headnode\n")

    command = "%s/execs/bin/appsigcheck.sh `which md5sum`" % (resource['appKerDir'], )
    out = cfg.sshCommand(rsh, command)

    signature_found = out.count("===ExeBinSignature===") > 0
    md5_found = out.count("MD5:") > 0
    if signature_found and md5_found:
        log.info("App.signature calculator is working on headnode\n")
        return

    if dry_run:
        log.dry_run("App.signature calculator is not working\n")
        return

    log.error(
        "App.signature calculator is not working\n"
        "See full error report below\n%s", out)
    exit(1)
def _decode_gzip_base64(value):
    """Return the text held in a base64 encoded gzip blob, '' if it can not be decoded."""
    import base64
    import binascii
    import gzip
    import zlib

    if not value:
        return ''
    try:
        return gzip.decompress(base64.b64decode(value)).decode(errors='replace')
    except (binascii.Error, ValueError, OSError, EOFError, zlib.error):
        return ''


def _stripped_child_text(element, tag):
    """Return the stripped text of child ``tag`` of ``element`` or None when it is absent."""
    text = element.findtext(tag)
    return None if text is None else text.strip()


def analyse_test_job_results(task_id, resource, app_name="test"):
    """Analyse the output of a finished test job and log every problem found.

    The task record is fetched with ``akrrrestclient.get('/tasks/<task_id>')``.
    If the request fails, the task status contains "ERROR", or the instance
    info status is 0, the problem is logged and the process exits with status
    1 (in the latter two cases the test job lock file is removed first).

    Otherwise the XML report in ``akrr_xdmod_instanceinfo['body']`` is parsed
    and its parameters and statistics are checked: shell type, existence and
    accessibility flags, application signature, detected node list, whether
    the nodes resolve when pinged through an SSH session to the resource, and
    whether the number of detected entries matches ``nnodes * ppn``.  Each
    failed check is logged and increments ``log.error_count``.

    Args:
        task_id: integer id of the task to analyse.
        resource: resource configuration mapping; ``name`` and ``ppn`` are
            read here.
        app_name: application kernel name, used for the lock file name and
            the results summary.

    Raises:
        Exception: whatever the SSH helpers raised during the ping test,
            re-raised after the captured output has been logged.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)

    r = akrrrestclient.get('/tasks/%d' % task_id)
    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)

    # parse the response body once instead of once per field
    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instance_info = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data['akrr_errmsg']

    results_summary = make_results_summary(resource['name'], app_name,
                                           completed_tasks,
                                           akrr_xdmod_instance_info,
                                           akrr_errmsg)

    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(
        akrr_xdmod_instance_info['body'])
    elm_benchmark = elm_perf.find('benchmark')
    elm_parameters = elm_benchmark.find('parameters')
    elm_statistics = elm_benchmark.find('statistics')

    # defaults describe a failed check, the report overrides them below
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }

    for elm in elm_parameters:
        variable = _stripped_child_text(elm, 'ID')
        value = _stripped_child_text(elm, 'value')
        units = _stripped_child_text(elm, 'units')

        if variable in ('App:ExeBinSignature', 'RunEnv:Nodes'):
            # These two values arrive as base64 encoded gzip data.  They are
            # decoded in-process: the text comes from the job output and must
            # not be interpolated into a shell command line.
            value = _decode_gzip_base64(value)
        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value

    for elm in elm_statistics:
        variable = _stripped_child_text(elm, 'ID')
        value = _stripped_child_text(elm, 'value')
        units = _stripped_child_text(elm, 'units')

        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))

    files_exists = [
        'Network scratch directory exists', 'App kernel input exists',
        'local scratch directory exists', 'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]

    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1
    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1
    for dir_access in dirs_access:
        if statistics[dir_access] == '0':
            log.error(dir_access.replace('accessible', 'is not accessible'))
            log.error_count += 1

    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it. "
            "see application output for more hints")
        log.error_count += 1

    # test the nodes, log to headnode and ping them
    if parameters['RunEnv:Nodes'] == '':
        log.error(
            "Nodes are not detected, check batchJobTemplate and setup of AKRR_NODELIST variable"
        )
        log.error_count += 1

    nodes = parameters['RunEnv:Nodes'].split()

    # NOTE(review): eval() is applied to a string received from the AKRR
    # server.  If resource_param is always a plain literal,
    # ast.literal_eval would be a safer replacement - confirm before changing.
    requested_nodes = eval(completed_tasks['resource_param'])['nnodes']

    # Everything printed while the session is open is collected here so that
    # it can be attached to the error report below.
    str_io = io.StringIO()
    number_of_unknown_hosts = 0
    try:
        sys.stdout = sys.stderr = str_io
        try:
            rsh = cfg.sshResource(resource)
            try:
                for node in set(nodes):
                    log.debug2(node)
                    out = cfg.sshCommand(rsh, "ping -c 1 %s" % node)
                    if out.count("unknown host") > 0:
                        number_of_unknown_hosts += 1
            finally:
                # close the session on failure as well as on success
                rsh.close(force=True)
        finally:
            # always hand the real streams back, even on KeyboardInterrupt
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
    except Exception:
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise

    if number_of_unknown_hosts > 0:
        log.error(
            "ERROR %d: Can not ping compute nodes from head node\n" %
            (log.error_count + 1) +
            "Nodes on which test job was executed detected as " +
            parameters['RunEnv:Nodes'] + "\n" +
            "If these names does not have sense check batchJobTemplate and setup of AKRR_NODELIST "
            "variable in resource configuration file")
        log.error_count += 1

    # check ppn count
    if requested_nodes * resource['ppn'] != len(nodes):
        log.error(
            "ERROR {}: Number of requested processes (processes per node * nodes) "
            "do not match actual processes executed\n"
            "Either\n"
            " AKRR_NODELIST variable is set incorrectly\n"
            "Or\n"
            " processes per node (PPN) is wrong\n".format(log.error_count + 1))
        log.error_count += 1

    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
def copy_exec_sources_and_inputs(rsh, resource):
    """Copy exec sources and inputs to remote resource.

    The contents of ``resource['appKerDir']`` are listed through ``rsh``.  For
    each of ``inputs`` and ``execs`` that is not listed, the matching tarball
    from ``cfg.appker_repo_dir`` is copied with ``cfg.scpToResource``,
    unpacked with ``tar`` and the unpacked directory is verified.  A directory
    that is already present is left alone and a warning is counted in
    ``log.warning_count``.  In ``dry_run`` mode only the log messages are
    produced; nothing is copied, unpacked or removed.

    Args:
        rsh: open remote shell session to the resource.
        resource: resource configuration mapping; ``appKerDir`` and ``name``
            are read here.

    Raises:
        Exception: on any failure, after a critical log record naming the
            resource; "files are not copied!" when an unpacked directory can
            not be found afterwards.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        " app. kernel input files and \n"
        " HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    try:
        # the relative paths in the tar/rm commands below depend on this cd
        cfg.sshCommand(rsh, "cd %s" % resource['appKerDir'])
        out = cfg.sshCommand(rsh, "ls " + resource['appKerDir'])
        files_in_appker_dir = out.strip().split()

        if not ("inputs" in files_in_appker_dir
                or "inputs/" in files_in_appker_dir):
            log.info("Copying app. kernel input tarball to %s",
                     resource['appKerDir'])
            if not dry_run:
                cfg.scpToResource(resource,
                                  cfg.appker_repo_dir + "/inputs.tar.gz",
                                  resource['appKerDir'])

            log.info("Unpacking app. kernel input files to %s/inputs",
                     resource['appKerDir'])
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/inputs.tar.gz" % resource['appKerDir'])
                log.debug(out)

                out = cfg.sshCommand(rsh,
                                     "du -h %s/inputs" % resource['appKerDir'])
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info("App. kernel input files are in %s/inputs\n",
                             resource['appKerDir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, resource['appKerDir'])

        if not ("execs" in files_in_appker_dir
                or "execs/" in files_in_appker_dir):
            log.info(
                "Copying app. kernel execs tarball to %s\n"
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator",
                resource['appKerDir'])
            if not dry_run:
                cfg.scpToResource(resource,
                                  cfg.appker_repo_dir + "/execs.tar.gz",
                                  resource['appKerDir'])

            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                resource['appKerDir'])
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/execs.tar.gz" % resource['appKerDir'])
                log.debug(out)

                out = cfg.sshCommand(rsh,
                                     "df -h %s/execs" % resource['appKerDir'])
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info(
                        "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                        resource['appKerDir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, resource['appKerDir'])
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n"
            )

        if not dry_run:
            # a dry run copies nothing, so it must not delete anything either
            cfg.sshCommand(rsh, "rm execs.tar.gz inputs.tar.gz")
    except Exception:
        log.critical("Can not copy files to %s", resource['name'])
        raise
def get_file_system_access_points():
    """Ask the user for the four remote directories and store them in module globals.

    The globals ``localScratch``, ``networkScratch``, ``appKerDir`` and
    ``akrrData`` are filled in that order.  Each answer (or the offered
    default when the answer is blank) is checked through the ``rsh`` session
    and finally passed through ``echo`` on the remote side, so the stored
    value is what the remote shell prints for it.

    * local scratch is asked once; a failed check only produces warnings.
    * network scratch is asked again only while the answer is empty; a failed
      check produces a warning and the value is kept.
    * app kernel directory and working directory are asked repeatedly until
      the check succeeds.
    """
    global resource_name
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir

    def _ask(default):
        # prompt showing the default, a blank answer selects the default
        answer = input("[%s]" % default)
        return default if answer.strip() == "" else answer

    def _expand(path):
        # value as printed by `echo` in the remote shell
        return cfg.sshCommand(rsh, "echo %s" % (path, )).strip()

    home_dir = cfg.sshCommand(rsh, "echo $HOME").strip()
    scratch_network_dir = cfg.sshCommand(rsh, "echo $SCRATCH").strip()

    # localScratch: asked exactly once, both outcomes continue
    local_scratch_default = "/tmp"
    log.log_input(
        "Enter location of local scratch (visible only to single node):")
    localScratch = _ask(local_scratch_default)
    status, msg = resource_deploy.check_dir_simple(rsh, localScratch)
    if status:
        log.info(msg)
    else:
        log.warning(msg)
        log.warning(
            'local scratch might be have a different location on head node, so if it is by design it is ok'
        )
    log.empty_line()
    localScratch = _expand(localScratch)

    # networkScratch: $SCRATCH of the remote shell is offered when it is set
    network_scratch_default = scratch_network_dir
    network_scratch_visible = False
    while True:
        log.log_input(
            "Enter location of network scratch (visible only to all nodes),"
            "used for temporary storage of app kernel input/output:")
        if network_scratch_default != "":
            networkScratch = _ask(network_scratch_default)
        else:
            networkScratch = input("")

        if networkScratch == "":
            log.error("Incorrect value for networkScratch, try again")
            continue

        status, msg = resource_deploy.check_dir(rsh,
                                                networkScratch,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            network_scratch_visible = True
            log.empty_line()
        else:
            log.warning(msg)
        break
    networkScratch = _expand(networkScratch)

    # appKerDir: repeat until the directory check passes
    appker_dir_default = os.path.join(home_dir, "appker", resource_name)
    while True:
        log.log_input(
            "Enter future location of app kernels input and executable files:")
        appKerDir = _ask(appker_dir_default)
        status, msg = resource_deploy.check_dir(rsh,
                                                appKerDir,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            log.empty_line()
            break
        log.error(msg)
    appKerDir = _expand(appKerDir)

    # akrrData: the default is on network scratch when that passed its check
    akrr_data_root = networkScratch if network_scratch_visible else home_dir
    akrr_data_default = os.path.join(akrr_data_root, "akrr_data", resource_name)
    while True:
        log.log_input(
            "Enter future locations for app kernels working directories (can or even should be on scratch space):"
        )
        akrrData = _ask(akrr_data_default)
        status, msg = resource_deploy.check_dir(rsh,
                                                akrrData,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            log.empty_line()
            break
        log.error(msg)
    akrrData = _expand(akrrData)