def connect_to_resource(resource):
    """Open an SSH connection to the resource described by ``resource``.

    Output written to ``sys.stdout``/``sys.stderr`` while connecting is
    captured and only reported if the connection fails.

    Args:
        resource: resource configuration dictionary; the keys ``name`` and
            ``ssh_private_key_file`` are read here, the whole dictionary is
            handed to ``akrr.util.ssh.ssh_resource``.

    Returns:
        Whatever ``akrr.util.ssh.ssh_resource`` returns for the resource.

    Exits the process with status 1 when the configured private key file is
    missing or when ``ssh_resource`` raises ``AkrrError``.
    """
    log.info("Validating resource accessibility. Connecting to %s.",
             resource['name'])

    key_file = resource['ssh_private_key_file']
    if key_file is not None and not os.path.isfile(key_file):
        log.error("Can not access ssh private key (%s)", key_file)
        exit(1)

    captured = io.StringIO()
    # Remember the streams that are active right now, so that a caller that
    # already redirected them gets its own streams back afterwards.
    saved_streams = (sys.stdout, sys.stderr)
    try:
        sys.stdout = sys.stderr = captured
        try:
            rsh = akrr.util.ssh.ssh_resource(resource)
        finally:
            # Restore on every exit path, not only on success or AkrrError.
            sys.stdout, sys.stderr = saved_streams
    except AkrrError:
        log.critical("Can not connect to %s\nMessage:\n%s", resource['name'],
                     captured.getvalue())
        exit(1)

    log.info("Successfully connected to %s\n", resource['name'])
    log.empty_line()
    return rsh
def setup():
    """Drive an interactive ``akrr setup`` run through the bash helper.

    Answers the database-user, super-user and cron e-mail prompts with the
    module-level settings and exits with status 1 when the captured output
    does not report a running AKRR.
    """
    if add_fake_modw:
        _add_fake_modw()

    # Start the bash shell used to run and answer the setup command.
    shell = get_bash()
    shell.output = ""
    shell.timeoutMessage = 'Unexpected behavior of prep.sh (premature EOF or TIMEOUT)'

    shell.runcmd('which python3', printOutput=True)
    shell.runcmd('which ' + cfg.which_akrr, printOutput=True)

    # Pass --akrr-home only when a non-default home directory is configured.
    home_option = (
        " --akrr-home " + cfg.akrr_home_dir
        if cfg.default_akrr_home_dir != cfg.akrr_home_dir
        else "")

    # Start akrr setup.
    setup_command = cfg.which_akrr + " setup " + dry_run_flag + home_option
    shell.startcmd(setup_command)

    # Database user for AKRR (mod_akrr).
    akrr_db_prompt = r'Please specify a database user to access mod_akrr database.*\n\[\S+\]:'
    _send_user_password(
        shell, akrr_db_prompt, akrr_db_user_name, akrr_db_user_password)
    _send_su_user_password(
        shell, akrr_db_su_user_name, akrr_db_su_user_password)

    # AK database (mod_appkernel).
    ak_db_prompt = r'Please specify a database user to access mod_appkernel database.*\n\[\S+\]:'
    _send_user_password(
        shell, ak_db_prompt, ak_db_user_name, ak_db_user_password)
    _send_su_user_password(shell, ak_db_su_user_name, ak_db_su_user_password)

    # XD database.
    # NOTE(review): the XDMoD prompt is answered with the AK database
    # credentials - confirm this is intended.
    xd_db_prompt = r'Please specify the user that will be connecting to the XDMoD database.*\n\[\S+\]:'
    _send_user_password(
        shell, xd_db_prompt, ak_db_user_name, ak_db_user_password)
    _send_su_user_password(shell, ak_db_su_user_name, ak_db_su_user_password)

    # Cron e-mail: an empty answer is sent when no address is configured.
    email_answer = cron_email
    if email_answer is None:
        email_answer = ""
    shell.expectSendline(
        r'.*INPUT.* Please enter the e-mail where cron will send messages.*\n',
        email_answer)

    # Wait for the prompt, i.e. for the setup command to finish.
    shell.justExpect(shell.prompt, timeout=60)

    log.info(shell.output)

    if "AKRR is set up and is running." not in shell.output:
        log.critical("AKRR was not set up")
        exit(1)

    log.info("AKRR is set up and is running.")
def append_to_bashrc(resource):
    """Write the AKRR environment variables into the resource's ``.bashrc``.

    A previously written AKRR section (delimited by the ``[Start]``/``[End]``
    marker comments) is cut out first, after backing the file up to
    ``$HOME/.bashrc_akrrbak``; then a fresh section is appended. Nothing is
    done in dry-run mode.

    Args:
        resource: resource configuration dictionary; the keys ``name``,
            ``network_scratch``, ``local_scratch``, ``appkernel_dir`` and
            ``akrr_data`` are read here.

    Raises:
        Exception: whatever the SSH helpers raise; it is logged together with
            the captured output and re-raised unchanged.
    """
    # append environment variables to .bashrc
    log.info("\nAdding AKRR enviroment variables to resource's .bashrc!\n")
    if akrr.dry_run:
        return

    akrr_header = 'AKRR Remote Resource Environment Variables'

    # Raw string: the backslashes belong to grep's pattern, not to Python.
    remove_old_section = r'''if [ -e $HOME/.bashrc ]
then
    if [[ `grep "\#@HEADER@ \[Start\]" $HOME/.bashrc` == *"@HEADER@ [Start]"* ]]
    then
        echo "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak"
        cp $HOME/.bashrc $HOME/.bashrc_akrrbak
        head -n "$(( $(grep -n '\#@HEADER@ \[Start\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) - 1 ))" $HOME/.bashrc_akrrbak > $HOME/.bashrc
        tail -n "+$(( $(grep -n '\#@HEADER@ \[End\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) + 1 ))" $HOME/.bashrc_akrrbak >> $HOME/.bashrc
    fi
fi'''.replace("@HEADER@", akrr_header)

    exported_variables = (
        ("AKRR_NETWORK_SCRATCH", 'network_scratch'),
        ("AKRR_LOCAL_SCRATCH", 'local_scratch'),
        ("AKRR_APPKER_DIR", 'appkernel_dir'),
        ("AKRR_AKRR_DIR", 'akrr_data'),
    )

    captured = io.StringIO()
    # Remember the current streams so they can be put back on any exit path.
    saved_streams = (sys.stdout, sys.stderr)
    try:
        sys.stdout = sys.stderr = captured
        rsh = akrr.util.ssh.ssh_resource(resource)
        try:
            log.debug(akrr.util.ssh.ssh_command(rsh, remove_old_section))

            cmds = ['echo "Appending AKRR records to $HOME/.bashrc"',
                    'echo "#' + akrr_header + ' [Start]" >> $HOME/.bashrc']
            cmds.extend(
                'echo "export {}=\\"{}\\"" >> $HOME/.bashrc'.format(
                    variable, resource[key])
                for variable, key in exported_variables)
            cmds.append('echo "#' + akrr_header + ' [End]" >> $HOME/.bashrc')
            cmds.append('echo "Appending AKRR records to $HOME/.bashrc"')

            for cmd in cmds:
                log.debug(akrr.util.ssh.ssh_command(rsh, cmd))
        finally:
            # Close the session even when one of the commands failed.
            rsh.close(force=True)
    except Exception:
        # Put the streams back before logging so the report is not captured.
        sys.stdout, sys.stderr = saved_streams
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], captured.getvalue())
        raise
    finally:
        sys.stdout, sys.stderr = saved_streams
def _config_setup():
    """Load the ``setup`` section of the test configuration into module globals.

    Exits with status 1 when no akrr executable is configured. In dry-run
    mode the module-level ``dry_run_flag`` is set to the CLI option string.
    """
    global dry_run_flag

    if cfg.which_akrr is None:
        log.critical("Can not find akrr. It should be in PATH or set in conf.")
        exit(1)

    # Every key of the "setup" section becomes a module-level variable.
    module_namespace = globals()
    module_namespace.update(cfg.yml["setup"])

    if cfg.dry_run:
        dry_run_flag = " --dry-run "
def set_default_value_for_unset_vars():
    """Post-process settings: derive every module-level value still unset.

    Resolves the akrr executable (via ``which akrr`` when it is unset or just
    ``"akrr"``), classifies the installation by the executable's directory
    and derives the home, configuration and log directories from it.

    Installation kinds are mutually exclusive:
        * ``/usr/bin``       -> ``rpm_install``
        * ``/usr/local/bin`` -> ``dev_install``
        * anything else      -> ``in_source_install``

    Raises:
        Exception: re-raised from ``run_cmd_getoutput`` when the executable
            cannot be located.
    """
    import os
    from .util import run_cmd_getoutput
    from akrr.util import log

    global which_akrr
    global akrr_conf
    global akrr_conf_dir
    global akrr_home_dir
    global default_akrr_home_dir
    global akrr_log_dir
    global in_source_install
    global rpm_install
    global dev_install

    if which_akrr is None or which_akrr == "akrr":
        try:
            which_akrr = run_cmd_getoutput("which akrr").strip()
        except Exception:
            log.critical("Can not find akrr executable")
            raise

    # One chain, so that an RPM install is not also flagged as in-source
    # (which would make the default home the parent of /usr/bin).
    akrr_bin_dir = os.path.dirname(which_akrr)
    if akrr_bin_dir == "/usr/bin":
        rpm_install = True
    elif akrr_bin_dir == "/usr/local/bin":
        dev_install = True
    else:
        in_source_install = True

    # set default_akrr_home_dir
    if in_source_install:
        # In-source layout: <home>/bin/akrr, so home is two levels up.
        default_akrr_home_dir = os.path.abspath(
            os.path.dirname(os.path.dirname(which_akrr)))
    elif rpm_install or dev_install:
        default_akrr_home_dir = os.path.expanduser("~/akrr")

    if akrr_home_dir is None:
        akrr_home_dir = default_akrr_home_dir
    else:
        akrr_home_dir = os.path.expanduser(akrr_home_dir)

    akrr_conf_dir = os.path.join(akrr_home_dir, "etc")
    akrr_conf = os.path.join(akrr_home_dir, "etc", 'akrr.conf')
    akrr_log_dir = os.path.join(akrr_home_dir, "log")

    log.debug("AKRR conf dir and log dir locations:\n"
              " akrr_home: {}\n"
              " akrr_conf: {}\n"
              " akrr_conf_dir: {}\n"
              " akrr_log_dir: {}\n"
              "".format(akrr_home_dir, akrr_conf, akrr_conf_dir,
                        akrr_log_dir))
def start_daemon():
    """Start the AKRR daemon with ``<akrr bin dir>/akrr daemon start``.

    Does nothing beyond logging in dry-run mode. Exits the process with the
    command's exit status when that status is non-zero, and with status 1
    when the executable cannot be launched at all.
    """
    log.info("Starting AKRR daemon")
    if akrr.dry_run:
        return

    akrr_cli = os.path.join(_akrr_bin_dir, 'akrr')
    try:
        # Argument list instead of a shell string: an installation path with
        # spaces or shell metacharacters is passed through untouched.
        status = subprocess.call([akrr_cli, "daemon", "start"])
    except OSError as launch_error:
        # Without a shell, a missing or non-executable file raises here
        # instead of producing a non-zero status.
        log.critical("AKRR daemon didn't start.\n%s", launch_error)
        exit(1)

    if status != 0:
        log.critical("AKRR daemon didn't start.")
        exit(status)
def check_connection_to_rest_api():
    """Verify that the AKRR REST API answers a ``/scheduled_tasks`` request.

    Exits with status 1 when the server answers with a status other than
    200; re-raises any exception raised while talking to the server after
    logging it.
    """
    try:
        response = akrrrestclient.get('/scheduled_tasks')
        if response.status_code == 200:
            return

        log.error(
            "Can not get token for AKRR REST API ( %s )\nSee server response below\n%s",
            akrrrestclient.restapi_host,
            json.dumps(response.json(), indent=4))
        # SystemExit is not an Exception subclass, so it passes the handler
        # below untouched.
        exit(1)
    except Exception as e:
        log.critical(
            "Can not connect to AKRR REST API ( %s )\nIs it running?\nSee full error report below",
            akrrrestclient.restapi_host)
        raise e
def validate_resource_parameter_file(resource_name):
    """Validate a resource parameter file and return the resource config.

    The default resource configuration and then the resource's own
    ``resource.conf`` are executed into a scratch namespace as a syntax
    check; afterwards the resource is loaded with
    ``cfg.find_resource_by_name``.

    Args:
        resource_name: name of the resource directory under
            ``<cfg.cfg_dir>/resources``.

    Returns:
        The value returned by ``cfg.find_resource_by_name(resource_name)``.

    Raises:
        Exception: re-raised when either configuration file cannot be read,
            compiled or executed.

    Exits with status 1 when the parameter file does not exist or the
    resource cannot be loaded.
    """
    # @todo reuse cfg.verify_resource_params
    default_resource_param_filename = os.path.join(
        cfg.akrr_mod_dir, "default_conf", "default.resource.conf")
    resource_param_filename = os.path.join(
        cfg.cfg_dir, "resources", resource_name, "resource.conf")

    log.info("Validating %s parameters from %s", resource_name,
             resource_param_filename)

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) does not exist!",
                  resource_param_filename)
        exit(1)

    # check syntax
    try:
        scratch_namespace = {}
        for conf_filename in (default_resource_param_filename,
                              resource_param_filename):
            # Context manager so the file handle is closed right away.
            with open(conf_filename) as conf_file:
                conf_source = conf_file.read()
            # NOTE(review): configuration files are executed as Python code;
            # they must only be writable by trusted users.
            exec(compile(conf_source, conf_filename, 'exec'),
                 scratch_namespace)
    except Exception:
        log.critical(
            "Can not load resource from %s.\nProbably invalid syntax.",
            resource_param_filename)
        raise

    try:
        # The syntax is fine, now load the resource through cfg.
        resource = cfg.find_resource_by_name(resource_name)
    except Exception as e:
        log.error("Can not load resource config from %s!\n%s\n%s",
                  resource_param_filename, str(e), traceback.format_exc())
        exit(1)

    log.info(
        "Syntax of %s is correct and all necessary parameters are present.",
        resource_param_filename)
    log.empty_line()
    return resource
def submit_test_job(resource, app_name="test", nodes=2):
    """Submit a test run of ``app_name`` through the AKRR REST API.

    The returned task id is also written to the test-job lock file given by
    ``get_test_job_lock_filename(resource, app_name)``.

    Args:
        resource: resource configuration dictionary; ``name`` is read here.
        app_name: application kernel to run.
        nodes: number of nodes requested for the test job.

    Returns:
        The ``task_id`` reported by the server.

    Raises:
        Exception: re-raised when the request fails or the answer does not
            have the expected layout.

    Exits with status 1 when the server answers with a status other than 200.
    """
    # submit test job
    r = None
    try:
        payload = {
            'resource': resource['name'],
            'app': app_name,
            'resource_param': "{'nnodes':%d}" % nodes,
            'task_param': "{'test_run':True}"
        }
        r = akrrrestclient.post('/scheduled_tasks', data=payload)
        if r.status_code != 200:
            log.error(
                "Can not submit task through AKRR REST API ( %s )\nSee server response below\n%s\n",
                akrrrestclient.restapi_host,
                json.dumps(r.json(), indent=4))
            exit(1)
        task_id = r.json()['data']['data']['task_id']
    except Exception:
        if r is not None:
            # The body may not be JSON (which can be the very reason we are
            # here); fall back to the raw text so the report is still logged
            # and the original exception is the one that propagates.
            try:
                server_answer = r.json()
            except ValueError:
                server_answer = r.text
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\nSee full error report below\n%s",
                akrrrestclient.restapi_host, server_answer)
        else:
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\n", akrrrestclient.restapi_host)
        raise

    # write file with task_id
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    with open(test_job_lock_filename, "w") as fout:
        print(task_id, file=fout)

    log.info("\nSubmitted test job to AKRR, task_id is %d\n", task_id)
    return task_id
def process_common_args(cli_args):
    """Apply the command-line options shared by all sub-commands to ``cfg``.

    Args:
        cli_args: parsed arguments; options the parser did not define are
            simply skipped.
    """
    from . import cfg

    def option(name):
        """Return the option's value, or None when it is not in cli_args."""
        return getattr(cli_args, name) if name in cli_args else None

    if "cfg" in cli_args:
        cfg.load_cfg(cli_args.cfg)

    if option("verbose"):
        log.basicConfig(level=log.DEBUG)
        log.getLogger().setLevel(log.DEBUG)
        cfg.verbose = True

    if option("very_verbose"):
        log.basicConfig(level=1)
        log.getLogger().setLevel(1)

    if option("dry_run"):
        cfg.dry_run = cli_args.dry_run

    requested_akrr = option("which_akrr")
    if requested_akrr is not None:
        cfg.which_akrr = requested_akrr
        # The bare name "akrr" is looked up later; anything else has to be an
        # existing path.
        path_is_missing = not os.path.exists(cfg.which_akrr)
        if cfg.which_akrr != "akrr" and path_is_missing:
            log.critical("Path to akrr is incorrect. Can not find " +
                         cfg.which_akrr)

    cfg.set_default_value_for_unset_vars()
from akrr.util.sql import set_user_password_host_port_db
from akrr.util.sql import db_exist
from akrr.util.sql import cv
from akrr.util.sql import db_check_priv
from akrr.util.sql import get_db_client_host
from akrr.util.sql import create_user_if_not_exists
import akrr.update
from akrr.util import make_dirs
from akrr.akrrversion import akrrversion

# Since AKRR setup is the first script to execute,
# check the Python version, library presence and external commands here.

# Python version.
# Compare as a tuple: testing major and minor separately would also reject
# any later release whose minor number is below 4 (e.g. a 4.0).
if sys.version_info < (3, 4):
    log.critical("Python should be of version 3.4+. This one is " +
                 sys.version)
    exit(1)

# Check openssl presence.
try:
    subprocess.check_output("which openssl", shell=True)
except Exception:
    log.error("""openssl program is not available. Install it!
    For example by running
    on CentOS
        sudo yum install openssl openssh-clients
    on Ubuntu:
        sudo apt-get install openssl""")
    raise

_akrr_dirs = akrr.get_akrr_dirs()
def _report_entry_fields(element):
    """Return the stripped ``ID``, ``value`` and ``units`` texts of a report entry."""
    fields = []
    for tag in ('ID', 'value', 'units'):
        text = element.findtext(tag)
        fields.append(text.strip() if text is not None else None)
    return tuple(fields)


def _decode_base64_gzip(encoded):
    """Decode a base64-encoded, gzip-compressed text; '' when it is not decodable."""
    import base64
    import gzip
    import zlib

    if encoded is None:
        return ''
    try:
        return gzip.decompress(base64.b64decode(encoded)).decode(
            errors='replace')
    except (ValueError, OSError, EOFError, zlib.error):
        # ValueError covers binascii.Error (bad base64); the others are what
        # gzip raises for damaged or truncated data.
        return ''


def _count_unknown_hosts(resource, nodes):
    """Ping every distinct node from the head node; return how many are unknown hosts."""
    captured = io.StringIO()
    # Remember the current streams so they can be put back on any exit path.
    saved_streams = (sys.stdout, sys.stderr)
    try:
        sys.stdout = sys.stderr = captured
        rsh = akrr.util.ssh.ssh_resource(resource)
        try:
            number_of_unknown_hosts = 0
            for node in set(nodes):
                log.debug2(node)
                out = akrr.util.ssh.ssh_command(rsh, "ping -c 1 %s" % node)
                if out.count("unknown host") > 0:
                    number_of_unknown_hosts += 1
        finally:
            # Close the session even when a ping command failed.
            rsh.close(force=True)
    except Exception:
        # Put the streams back before logging so the report is not captured.
        sys.stdout, sys.stderr = saved_streams
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], captured.getvalue())
        raise
    finally:
        sys.stdout, sys.stderr = saved_streams
    return number_of_unknown_hosts


def analyse_test_job_results(task_id, resource, app_name="test"):
    """Analyse the output of a finished test job and log every problem found.

    The task record is fetched from the AKRR REST API. A failed task removes
    the test-job lock file and exits with status 1. Otherwise the XML report
    is inspected and ``log.error_count`` is incremented for each problem:
    non-bash shell, missing files, inaccessible directories, missing
    application signature and, for resources whose batch scheduler is not
    ``openstack``, undetected or unreachable nodes and a process-count
    mismatch.

    Args:
        task_id: id of the test task (formatted with ``%d``).
        resource: resource configuration dictionary; ``name``,
            ``batch_scheduler`` and ``ppn`` are read here.
        app_name: application kernel the test job ran.

    Raises:
        Exception: re-raised from the SSH helpers when the head node cannot
            be reached for the ping test.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    r = akrrrestclient.get('/tasks/%d' % task_id)

    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)

    # Parse the response body once instead of once per field.
    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instance_info = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data.get('akrr_errmsg', "None")

    results_summary = make_results_summary(
        resource['name'], app_name, completed_tasks,
        akrr_xdmod_instance_info, akrr_errmsg)

    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(
        akrr_xdmod_instance_info['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')

    # Defaults stand for "not reported"; the report overrides them below.
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }

    for elm in elm_parameters:
        variable, value, units = _report_entry_fields(elm)
        if variable in ('App:ExeBinSignature', 'RunEnv:Nodes'):
            # Decoded in-process: the reported value is never handed to a
            # shell command line.
            value = _decode_base64_gzip(value)
        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value

    for elm in elm_statistics:
        variable, value, units = _report_entry_fields(elm)
        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))

    files_exists = [
        'Network scratch directory exists', 'App kernel input exists',
        'local scratch directory exists', 'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]

    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1

    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1

    for dir_access in dirs_access:
        if statistics[dir_access] == '0':
            log.error(dir_access.replace('accessible', 'is not accessible'))
            log.error_count += 1

    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it."
            "see application output for more hints")
        log.error_count += 1

    if resource['batch_scheduler'].lower() != "openstack":
        # test the nodes, log to headnode and ping them
        if parameters['RunEnv:Nodes'] == '':
            log.error(
                "Nodes are not detected, check batch_job_template and setup of AKRR_NODELIST variable"
            )
            log.error_count += 1

        nodes = parameters['RunEnv:Nodes'].split()

        # NOTE(review): eval() on a string stored with the task record; if
        # that record can hold anything but a dict literal, switch to
        # ast.literal_eval.
        requested_nodes = eval(completed_tasks['resource_param'])['nnodes']

        number_of_unknown_hosts = _count_unknown_hosts(resource, nodes)
        if number_of_unknown_hosts > 0:
            log.error(
                "ERROR %d: Can not ping compute nodes from head node\n" %
                (log.error_count + 1) +
                "Nodes on which test job was executed detected as " +
                parameters['RunEnv:Nodes'] + "\n" +
                "If these names does not have sense check batch_job_template and setup of AKRR_NODELIST "
                "variable in resource configuration file")
            log.error_count += 1

        # check ppn count
        if requested_nodes * resource['ppn'] != len(nodes):
            log.error(
                "ERROR {}: Number of requested processes (processes per node * nodes) "
                "do not match actual processes executed"
                "Either\n"
                " AKRR_NODELIST variable is set incorrectly\n"
                "Or\n"
                " processes per node (PPN) is wrong\n".format(
                    log.error_count + 1))
            log.error_count += 1

    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
def copy_exec_sources_and_inputs(rsh, resource):
    """Copy the app. kernel inputs and executables' sources to the resource.

    Each tarball is copied and unpacked only when its target directory
    (``inputs`` / ``execs``) is not yet listed in the resource's app. kernel
    directory; otherwise a warning is logged and the existing directory is
    kept. In dry-run mode the copy and unpack steps are skipped.

    Args:
        rsh: open remote shell, used through ``akrr.util.ssh.ssh_command``.
        resource: resource configuration dictionary; ``appkernel_dir`` and
            ``name`` are read here.

    Raises:
        Exception: re-raised after logging when any step fails, including
            the check that the unpacked directory exists.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        " app. kernel input files and \n"
        " HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    run_remote = akrr.util.ssh.ssh_command
    try:
        appker_dir = resource['appkernel_dir']

        run_remote(rsh, "cd %s" % appker_dir)
        listing = run_remote(rsh, "ls " + appker_dir)
        present_entries = listing.strip().split()

        # App. kernel inputs.
        if "inputs" not in present_entries and "inputs/" not in present_entries:
            log.info("Copying app. kernel input tarball to %s", appker_dir)
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/inputs.tar.gz",
                    appker_dir)

            log.info("Unpacking app. kernel input files to %s/inputs",
                     appker_dir)
            if not akrr.dry_run:
                unpack_output = run_remote(
                    rsh, "tar xvfz %s/inputs.tar.gz" % appker_dir)
                log.debug(unpack_output)

                check_output = run_remote(rsh, "du -h %s/inputs" % appker_dir)
                log.debug(check_output)

                if check_output.count("No such file or directory") != 0:
                    raise Exception("files are not copied!")
                log.info("App. kernel input files are in %s/inputs\n",
                         appker_dir)
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, appker_dir)

        # App. kernel executables' sources.
        if "execs" not in present_entries and "execs/" not in present_entries:
            log.info(
                "Copying app. kernel execs tarball to %s\n" % (appker_dir) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator"
            )
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/execs.tar.gz",
                    appker_dir)

            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                appker_dir)
            if not akrr.dry_run:
                unpack_output = run_remote(
                    rsh, "tar xvfz %s/execs.tar.gz" % appker_dir)
                log.debug(unpack_output)

                check_output = run_remote(rsh, "df -h %s/execs" % appker_dir)
                log.debug(check_output)

                if check_output.count("No such file or directory") != 0:
                    raise Exception("files are not copied!")
                log.info(
                    "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                    appker_dir)
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, appker_dir)
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n"
            )

        # Remove the tarballs from the directory entered with "cd" above.
        run_remote(rsh, "rm execs.tar.gz inputs.tar.gz")
    except Exception as e:
        log.critical("Can not copy files to %s", resource['name'])
        raise e