def _read_username_password(prompt="Enter username:",
                            username=None,
                            password=None,
                            default_username="user",
                            password_on_default_user=None):
    """Ask the user for a username and a password, unless already supplied.

    NOTE(review): the parameter list and the two password prompts were
    restored from a span damaged by secret redaction; the names are the ones
    the body reads, but confirm the result against version control.

    :param prompt: message shown before the username is requested.
    :param username: preset username; when not None no question is asked.
    :param password: preset password; when not None no question is asked.
    :param default_username: username used when the user just hits enter.
    :param password_on_default_user: password silently used when the default
        username was chosen and no password was preset.
    :return: tuple ``(username, password)``.
    """
    import getpass

    log.log_input(prompt)

    if username is None:
        username = input('[{0}] '.format(default_username))
        if username == '':
            username = default_username
    else:
        log.info("User, " + username + ", already entered.")

    # The default user may come with a known password; use it without asking.
    if (username == default_username and password is None
            and password_on_default_user is not None):
        password = password_on_default_user

    if password is None:
        # Ask twice and loop until both entries agree.
        while True:
            log.log_input("Please specify a password:")
            password = getpass.getpass()
            log.log_input("Please reenter the password:")
            password2 = getpass.getpass()
            if password == password2:
                break
            log.error("Entered passwords do not match. Please try again.")
    else:
        log.info("Password already entered.")

    return username, password
def connect_to_resource(resource):
    """Connect to the resource described by the resource dictionary.

    Output produced while connecting is captured and only shown when the
    connection fails.

    :param resource: resource configuration dictionary; reads ``name`` and
        ``sshPrivateKeyFile``.
    :return: whatever ``cfg.sshResource(resource)`` returns.

    Exits the process with status 1 when the private key file is missing or
    when ``cfg.sshResource`` raises ``akrrError``.
    """
    log.info("Validating resource accessibility. Connecting to %s.",
             resource['name'])

    key_file = resource['sshPrivateKeyFile']
    if key_file is not None and not os.path.isfile(key_file):
        log.error("Can not access ssh private key (%s)", key_file)
        exit(1)

    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        try:
            rsh = cfg.sshResource(resource)
        finally:
            # Always undo the redirection, even for unexpected exceptions,
            # so later messages are not swallowed by the capture buffer.
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
    except akrrError:
        log.critical("Can not connect to %s\nMessage:\n%s", resource['name'],
                     str_io.getvalue())
        exit(1)

    log.info("Successfully connected to %s\n", resource['name'])
    log.empty_line()
    return rsh
def get_system_characteristics():
    """Ask the user for the number of processors (cores) per node.

    The answer is stored in the module-level ``ppn``. The question is
    repeated until a positive integer is entered; ``ppn`` is only assigned
    once a valid value was obtained.
    """
    global ppn
    while True:
        log.log_input("Enter processors (cores) per node count:")
        try:
            value = int(input(""))
        except (ValueError, TypeError):
            log.error("Incorrect entry, try again.")
            continue
        # A node can not have zero or a negative number of cores.
        if value < 1:
            log.error("Incorrect entry, try again.")
            continue
        ppn = value
        break
def check_shell(rsh, resource):
    """Verify that the login shell on the head node is bash.

    Runs ``echo $BASH`` through ``cfg.sshCommand`` and looks for "bash" in
    the reply. Exits the process with status 1 when it is not found.

    :param rsh: remote shell handle passed to ``cfg.sshCommand``.
    :param resource: resource dictionary; ``name`` is used in the message.
    """
    log.info("Checking if shell is BASH\n")
    reply = cfg.sshCommand(rsh, "echo $BASH")
    if "bash" not in reply:
        log.error(
            "Shell on headnode of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        exit(1)
    log.info("Shell is BASH\n")
def check_previous_installation():
    """Abort when a previous AKRR installation is detected.

    The presence of the ``akrr_cfg`` path counts as an existing
    installation; in that case uninstall instructions are logged and the
    process exits with status 1. Otherwise nothing happens.
    """
    if not os.path.exists(akrr_cfg):
        return

    pieces = [
        "This is a fresh installation script. ",
        akrr_home,
        " contains previous AKRR installation. Either uninstall it or see documentation on updates.\n\n",
        "To uninstall AKRR manually:\n\t1)remove ",
        akrr_cfg,
        "\n\t\trm ",
        akrr_cfg,
        "\n",
        "\t2) (optionally for totally fresh start) drop mod_akrr and mod_appkernel database\n",
        "\t\tDROP DATABASE mod_appkernel;\n",
        "\t\tDROP DATABASE mod_akrr;\n\n",
    ]
    log.error("".join(pieces))
    exit(1)
def init_mysql_dbs(self):
    """Create the databases, users and grants that still need to be set up.

    For each of the three databases (akrr, ak, xd) the work is done only when
    the matching ``*_db_su_user_name`` attribute is not None. Any exception
    is printed with its traceback, logged, and ends the process with
    status 1.
    """

    def _ensure_db_user_and_grants(con_fun, user, password, db, priv):
        """Create ``db`` and ``user`` when absent, then grant ``priv`` on ``db``."""
        log.info("Creating %s and user to access it if needed" % (db, ))
        su_con, su_cur = con_fun(True, None)
        client_host = get_db_client_host(su_cur)

        create_db_sql = "CREATE DATABASE IF NOT EXISTS %s" % (cv(db), )
        _cursor_execute(su_cur, create_db_sql)

        su_cur.execute("SELECT * FROM mysql.user WHERE User=%s AND Host=%s",
                       (user, client_host))
        existing_users = su_cur.fetchall()
        if len(existing_users) == 0:
            # Older MySQL versions lack CREATE USER IF NOT EXISTS,
            # hence the explicit lookup above.
            _cursor_execute(su_cur, "CREATE USER %s@%s IDENTIFIED BY %s",
                            (user, client_host, password))

        grant_sql = "GRANT " + cv(priv) + " ON " + cv(db) + ".* TO %s@%s"
        _cursor_execute(su_cur, grant_sql, (user, client_host))
        su_con.commit()

    # (su-user attribute, connection getter, user, password, db name, privilege)
    targets = (
        ("akrr_db_su_user_name", "get_akrr_db", "akrr_db_user_name",
         "akrr_db_user_password", "akrr_db_name", "ALL"),
        ("ak_db_su_user_name", "get_ak_db", "ak_db_user_name",
         "ak_db_user_password", "ak_db_name", "ALL"),
        ("xd_db_su_user_name", "get_xd_db", "xd_db_user_name",
         "xd_db_user_password", "xd_db_name", "SELECT"),
    )

    try:
        # During self.read_db_creds the db and user were checked; super user
        # credentials were requested only when something was missing, so a
        # non-None super user name means user, db or privileges must be set.
        for su_attr, con_attr, user_attr, password_attr, db_attr, priv in targets:
            if getattr(self, su_attr) is None:
                continue
            _ensure_db_user_and_grants(getattr(self, con_attr),
                                       getattr(self, user_attr),
                                       getattr(self, password_attr),
                                       getattr(self, db_attr),
                                       priv)
    except Exception as e:
        import traceback
        traceback.print_exc()
        log.error("Can not execute the sql setup script: " + str(e))
        exit(1)
def run(self):
    """Parse the command line and dispatch to the selected sub-command.

    The parsed arguments are first handed to ``self.process_common_args``;
    the sub-command handler is taken from the ``func`` attribute of the
    parsed arguments. An error is logged when no handler is attached.
    """
    log.info("AKRR Regression Tests")

    parsed = self.root_parser.parse_args()
    self.process_common_args(parsed)

    if not hasattr(parsed, "func"):
        log.error("There is no command specified!")
        return
    parsed.func(parsed)
def check_utils():
    """Check that the external programs AKRR relies on are available.

    Looks for ``ssh`` and ``openssl`` in PATH. When at least one is missing,
    a message naming every missing program is logged and the process exits
    with status 1.
    """
    # shutil.which replaces distutils.spawn.find_executable: distutils is
    # deprecated (PEP 632) and absent from Python 3.12+.
    import shutil

    missing = [tool for tool in ('ssh', 'openssl')
               if shutil.which(tool) is None]
    if missing:
        log.error("".join(
            "Can not find %s in PATH, please install it.\n" % tool
            for tool in missing))
        exit(1)
def check_connection_to_rest_api():
    """Check that the AKRR REST API answers a simple GET request.

    Requests ``/scheduled_tasks``. A non-200 answer is logged together with
    the server response and the process exits with status 1. Any exception
    raised along the way is logged as critical and re-raised.
    """
    try:
        response = akrrrestclient.get('/scheduled_tasks')
        if response.status_code != 200:
            host = akrrrestclient.restapi_host
            server_reply = json.dumps(response.json(), indent=4)
            log.error(
                "Can not get token for AKRR REST API ( %s )\nSee server response below\n%s",
                host, server_reply)
            exit(1)
    except Exception as e:
        log.critical(
            "Can not connect to AKRR REST API ( %s )\nIs it running?\nSee full error report below",
            akrrrestclient.restapi_host)
        raise e
def check_appsig(rsh, resource):
    """Run the app.signature calculator on the head node and check its output.

    ``appsigcheck.sh`` from the resource's ``appKerDir`` is executed on
    ``md5sum``; the output must contain both "===ExeBinSignature===" and
    "MD5:". On failure the process exits with status 1, unless ``dry_run``
    is set, in which case the failure is only reported.

    :param rsh: remote shell handle passed to ``cfg.sshCommand``.
    :param resource: resource dictionary; reads ``appKerDir``.
    """
    log.info("Testing app.signature calculator on headnode\n")

    command = "%s/execs/bin/appsigcheck.sh `which md5sum`" % (
        resource['appKerDir'], )
    out = cfg.sshCommand(rsh, command)

    if "===ExeBinSignature===" in out and "MD5:" in out:
        log.info("App.signature calculator is working on headnode\n")
        return

    if dry_run:
        log.dry_run("App.signature calculator is not working\n")
        return

    log.error(
        "App.signature calculator is not working\n" +
        "See full error report below\n%s", out)
    exit(1)
def _read_sql_su_credentials(host, port):
    """Ask for administrative database credentials until they work.

    The entered credentials are validated by opening a connection with
    ``get_con_to_db``; on any exception the error is logged and the user is
    asked again.

    NOTE(review): the end of the username ``input`` call and the start of
    the password prompt were restored from a span damaged by secret
    redaction; confirm against version control.

    :param host: database host, shown in the prompt and used to connect.
    :param port: database port, shown in the prompt and used to connect.
    :return: tuple ``(su_username, su_password)`` that connected successfully.
    """
    while True:
        log.log_input(
            "Please provide an administrative database user (for {}:{}) "
            "under which the installation sql script should "
            "run (This user must have privileges to create "
            "users and databases).".format(host, port))
        su_username = input("Username: ")
        log.log_input(
            "Please provide the password for the the user which you previously entered:"
        )
        su_password = getpass.getpass()

        try:
            get_con_to_db(su_username, su_password, host, port)
            return su_username, su_password
        except Exception as e:
            log.error("MySQL error: " + str(e))
            log.error("Entered credential is not valid. Please try again.")
def check_rw_db(connection_func, pre_msg, post_msg):
    """Check that a table can be created (and dropped) through a connection.

    A ``CREATE_ME`` table is created using the cursor obtained from
    ``connection_func``; afterwards a fresh connection is requested and the
    table is dropped again, whether or not the creation succeeded.

    :type connection_func function
    :type pre_msg str
    :type post_msg str

    :param connection_func: callable returning a (connection, cursor) tuple.
    :param pre_msg: message logged before the check starts.
    :param post_msg: message logged with the outcome of the CREATE statement;
        it receives the boolean status as its formatting argument.
    :return: True when the CREATE TABLE statement returned 0, False otherwise
        (including when a MySQLdb error occurred).
    """
    success = False
    log.info(pre_msg)

    try:
        con, cur = connection_func()
        try:
            with con:
                outcome = cur.execute(
                    "CREATE TABLE CREATE_ME(`id` INT NOT NULL PRIMARY KEY, `name` VARCHAR(48));"
                )
                success = bool(outcome == 0)
                report = log.info if success else log.error
                report(post_msg, success)
        except MySQLdb.Error as err:
            log.error(
                'Unable to create a table w/ the provided username. %s: %s',
                err.args[0], err.args[1])

        # Clean up with a new connection / cursor pair.
        con, cur = connection_func()
        try:
            with con:
                cur.execute("DROP TABLE CREATE_ME;")
        except MySQLdb.Error as err:
            log.error(
                'Unable to drop the table created to check permissions. %s: %s',
                err.args[0], err.args[1])
    except MySQLdb.Error as err:
        log.error('Unable to connect to Database. %s: %s', err.args[0],
                  err.args[1])

    return success
def db_check(mod_akrr=True, mod_appkernel=True, modw=True):
    """Run the privilege checks for the three databases and summarize them.

    All three checks are always executed (and log their own status); the
    flags only decide whether a given result counts towards the overall
    verdict.

    :param mod_akrr: count the 'mod_akrr' read/write check.
    :param mod_appkernel: count the 'mod_appkernel' read/write check.
    :param modw: count the 'modw' read check.
    :return: True when every counted check passed, False otherwise.
    """
    from akrr import cfg

    # CHECK: the akrr db
    akrr_ok = check_rw_db(cfg.getDB,
                          "Checking 'mod_akrr' Database / User privileges...",
                          "'mod_akrr' Database check complete - Status: %s")

    # CHECK: the app_kernel db
    app_kernel_ok = check_rw_db(
        cfg.getAKDB,
        "Checking 'mod_appkernel' Database / User privileges...",
        "'mod_appkernel' Database check complete - Status: %s")

    # CHECK: the XDMoD db
    xdmod_ok = check_r_db(cfg.getXDDB,
                          "Checking 'modw' Database / User privileges...",
                          "'modw' Database check complete - Status: %s")

    # DETERMINE: whether or not everything that counts passed.
    counted = [
        passed for wanted, passed in ((mod_akrr, akrr_ok),
                                      (mod_appkernel, app_kernel_ok),
                                      (modw, xdmod_ok)) if wanted
    ]
    if not all(counted):
        log.error(
            "One or more of the required databases and their required users ran into a problem. Please take note of the previous messages, correct the issue and re-run this script."
        )
        return False

    log.info("All Databases / User privileges check out!")
    return True
def init_dir(self):
    """Create the AKRR directory tree below ``akrr_home`` where missing.

    Directories are handled parents first. Any exception is logged and ends
    the process with status 1.
    """
    # Paths relative to akrr_home; the empty tuple is akrr_home itself.
    layout = (
        (),
        ('etc', ),
        ('etc', 'resources'),
        ('log', ),
        ('log', 'data'),
        ('log', 'comptasks'),
        ('log', 'akrrd'),
    )
    try:
        log.info("Creating directories structure.")
        for parts in layout:
            path = os.path.join(akrr_home, *parts)
            if not os.path.isdir(path):
                _make_dirs(path)
    except Exception as e:
        log.error("Can not create directories: " + str(e))
        exit(1)
def _remove_from_crontab(remove_mailto=False):
    """Remove AKRR entries from the current user's crontab.

    A line is dropped when it mentions both "akrr" and "restart.sh" (which
    also covers "checknrestart.sh"), or, when ``remove_mailto`` is set, when
    it contains "MAILTO". The filtered crontab is written to ``.crontmp`` in
    the current directory and installed with ``crontab``; with ``dry_run``
    set the would-be content is only logged. The temporary file is always
    removed afterwards.

    :param remove_mailto: also drop lines containing "MAILTO".
    """
    try:
        raw = subprocess.check_output("crontab -l", shell=True)
    except subprocess.CalledProcessError:
        log.error("Can not run crontab -l")
        return

    def _is_akrr_line(line):
        """Tell whether the crontab line should be removed."""
        if 'akrr' in line and 'restart.sh' in line:
            return True
        return bool(remove_mailto) and 'MAILTO' in line

    lines = raw.decode("utf-8").splitlines(True)
    kept = [line for line in lines if not _is_akrr_line(line)]

    if len(kept) == len(lines):
        # Nothing to do, so no temporary file is created at all.
        log.info("There was no AKRR records detected in crontab list")
        return

    log.info("AKRR Section present in crontab. Cleaning crontab.")
    tmp_name = ".crontmp"
    try:
        with open(tmp_name, "w") as f:
            f.writelines(kept)
        if not dry_run:
            output = subprocess.check_output("crontab .crontmp",
                                             shell=True).decode("utf-8")
            log.debug(output)
        else:
            log.info("DRY RUN: should run `crontab .crontmp`. .crontmp:" +
                     "".join(kept))
    except subprocess.CalledProcessError:
        log.error("Can not run crontab .crontmp")
    finally:
        # Never leave the scratch file behind, whatever happened above.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
def check_dir(sh, d, exit_on_fail=True, try_to_create=True):
    """Check a remote directory and optionally create it when missing.

    The state is obtained from ``check_dir_simple``; a status of None is
    treated as "does not exist" and True as "accessible for read/write".

    :param sh: remote shell handle; ``sh.remotemachine`` is used in messages.
    :param d: directory path on the remote machine.
    :param exit_on_fail: when False the ``(status, msg)`` pair is returned
        whatever the status is; otherwise a failed check ends the process
        with status 1.
    :param try_to_create: run ``mkdir -p`` when the directory is missing
        (in dry-run mode the creation is only assumed to succeed).
    :return: ``(status, msg)``; with ``exit_on_fail`` left True the only
        value returned is ``(True, msg)``.
    """
    status, msg = check_dir_simple(sh, d)

    if try_to_create is True and status is None:
        log.info("Directory %s:%s does not exists, will try to create it",
                 sh.remotemachine, d)
        if not dry_run:
            cmd = "mkdir -p \"%s\"" % (d, )
            cfg.sshCommand(sh, cmd)
            status, msg = check_dir_simple(sh, d)
        else:
            status, msg = (True,
                           "Directory exist and accessible for read/write")

    if exit_on_fail is False:
        return status, msg

    if status is True:
        return True, msg

    if status is None:
        log.error("Directory %s:%s does not exists!", sh.remotemachine, d)
    else:
        log.error("Directory %s:%s is NOT accessible for read/write!",
                  sh.remotemachine, d)
    # A bare exit() would report success (status 0) to the caller's shell.
    exit(1)
def resource_deploy(resource_name,
                    test_appkernel=None,
                    test_nodes=None,
                    deploy_timeout=600,
                    **_):
    """Run ``akrr resource deploy`` in a bash session and check its outcome.

    The command output is redirected to the file ``out`` in the current
    directory; the deployment counts as successful when that output contains
    "you can move to next step". Otherwise the output is logged and the
    process exits with status 1.

    :param resource_name: value passed with ``-r``.
    :param test_appkernel: value passed with ``-a``; omitted when None.
    :param test_nodes: value passed with ``-n``; omitted when None.
    :param deploy_timeout: seconds to wait for the shell prompt to return.
    """
    bash = get_bash()
    bash.output = ""
    bash.timeoutMessage = 'Unexpected behavior of prep.sh (premature EOF or TIMEOUT)'

    bash.runcmd('which python3', printOutput=True)
    bash.runcmd('which ' + cfg.which_akrr, printOutput=True)

    # now deploy
    cmd = "{}{} resource deploy{}{}{}{}".format(
        cfg.which_akrr,
        " -v" if cfg.verbose else "",
        " -r " + resource_name,
        " -a {}".format(test_appkernel) if test_appkernel is not None else "",
        " --dry-run" if cfg.dry_run else "",
        # -n needs its value; a bare "-n" leaves the option without argument.
        " -n {}".format(test_nodes) if test_nodes is not None else "")
    bash.startcmd(cmd + " > out")
    bash.justExpect(bash.prompt, timeout=deploy_timeout)

    with open("out", "rt") as fin:
        out = fin.read()

    if "you can move to next step" not in out:
        log.error("Unsuccessful deployment\n" + out)
        exit(1)
def submit_test_job(resource, app_name="test", nodes=2):
    """Submit a test task through the AKRR REST API and record its id.

    The task id is taken from the server reply and written to the lock file
    named by ``get_test_job_lock_filename``. A non-200 reply is logged and
    ends the process with status 1; any exception is logged as critical and
    re-raised.

    :param resource: resource dictionary; reads ``name``.
    :param app_name: application kernel to run.
    :param nodes: node count placed into the ``resource_param`` field.
    :return: the task id reported by the server.
    """
    response = None
    try:
        payload = dict(
            resource=resource['name'],
            app=app_name,
            resource_param="{'nnodes':%d}" % nodes,
            task_param="{'test_run':True}",
        )
        response = akrrrestclient.post('/scheduled_tasks', data=payload)
        if response.status_code != 200:
            log.error(
                "Can not submit task through AKRR REST API ( %s )\nSee server response below\n%s\n",
                akrrrestclient.restapi_host,
                json.dumps(response.json(), indent=4))
            exit(1)
        task_id = response.json()['data']['data']['task_id']
    except Exception as e:
        if response is None:
            # The request itself failed, there is no reply to show.
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\n", akrrrestclient.restapi_host)
        else:
            log.critical(
                "Can not submit task through AKRR REST API ( %s )\n"
                "Is it still running?\nSee full error report below\n%s",
                akrrrestclient.restapi_host, response.json())
        raise e

    # write file with task_id
    lock_filename = get_test_job_lock_filename(resource, app_name)
    with open(lock_filename, "w") as lock_file:
        print(task_id, file=lock_file)

    log.info("\nSubmitted test job to AKRR, task_id is %d\n", task_id)
    return task_id
def enable_resource_for_execution(resource):
    """Register the resource in mod_appkernel and switch it on in AKRR.

    First the ``resource`` table is updated (a record is inserted when none
    exists for the nickname, then it is enabled and made visible); after
    that the resource is enabled through the AKRR REST API. Failures of
    either step are logged, not raised. Nothing is done in dry-run mode.

    :param resource: resource dictionary; reads ``name`` and ``info``.
    """
    if dry_run:
        return

    resource_name = resource['name']
    select_sql = '''SELECT * FROM resource WHERE nickname=%s'''

    try:
        con_ak, cur_ak = cfg.getAKDB(True)

        cur_ak.execute(select_sql, (resource_name, ))
        rows = cur_ak.fetchall()
        if len(rows) == 0:
            log.warning(
                "There is no record of %s in mod_appkernel.resource will add one.",
                resource_name)
            cur_ak.execute(
                '''INSERT INTO resource (resource,nickname,description,enabled,visible) VALUES(%s,%s,%s,0,0);''',
                (resource['name'], resource['name'], resource['info']))
            con_ak.commit()

        cur_ak.execute(select_sql, (resource_name, ))
        rows = cur_ak.fetchall()
        record = rows[0]

        # enable and make visible
        cur_ak.execute(
            '''UPDATE resource SET enabled=1,visible=1 WHERE resource_id=%s;''',
            (record['resource_id'], ))
        con_ak.commit()
        log.info(
            "Enabled %s in mod_appkernel.resource for tasks execution and made it visible to XDMoD UI.",
            resource_name)
    except MySQLdb.Error:
        log.error("Can not connect to AK DB\n" "Probably invalid credential")

    # enabling resource for execution
    try:
        reply = akrrrestclient.put('/resources/' + resource_name + '/on')
        if reply.status_code != 200:
            log.error(
                "Can not enable resource through AKRR REST API ( %s )\nSee server response below\n%s",
                akrrrestclient.restapi_host,
                json.dumps(reply.json(), indent=4))
        else:
            log.info('Successfully enabled ' + resource_name)
    except requests.RequestException:
        log.error(
            "Can not enable resource through AKRR REST API ( %s )\n"
            "Is it still running?\n", akrrrestclient.restapi_host)
def validate_resource_name(m_resource_name):
    """Tell whether the name can be used for a new resource.

    The name is rejected (with an error logged) when it is blank, when a
    path with that name already exists below ``resources_dir``, or when it
    is already present in the mod_appkernel ``resource`` table or the
    mod_akrr ``resources`` table.

    :param m_resource_name: candidate resource name.
    :return: True when the name is free, False otherwise.
    """
    if not m_resource_name.strip():
        log.error("Bad name for resource, try a different name")
        return False

    # check config file presence
    cfg_path = os.path.abspath(os.path.join(resources_dir, m_resource_name))
    if os.path.exists(cfg_path):
        log.error(
            "Resource configuration directory (%s) for resource with name %s "
            "already present on file system, try a different name" % (
                cfg_path,
                m_resource_name,
            ))
        return False

    # check the entry in mod_appkernel
    _, cur_ak = cfg.getAKDB(True)
    cur_ak.execute('''SELECT * FROM resource WHERE nickname=%s''',
                   (m_resource_name, ))
    ak_rows = cur_ak.fetchall()
    if len(ak_rows) > 0:
        log.error(
            "Resource with name %s already present in mod_appkernel DB, try a different name"
            % (m_resource_name, ))
        return False

    # check the entry in mod_akrr
    _, cur_akrr = cfg.getDB(True)
    cur_akrr.execute('''SELECT * FROM resources WHERE name=%s''',
                     (m_resource_name, ))
    akrr_rows = cur_akrr.fetchall()
    if len(akrr_rows) > 0:
        log.error(
            "Resource with name %s already present in mod_akrr DB, try a different name"
            % (m_resource_name, ))
        return False

    return True
def check_r_db(connection_func, pre_msg, post_msg):
    """Check that `modw`.`resourcefact` can be read through a connection.

    :type connection_func function
    :type pre_msg str
    :type post_msg str

    :param connection_func: callable returning a (connection, cursor) tuple.
    :param pre_msg: message logged before the check starts.
    :param post_msg: message logged with the outcome of the SELECT statement;
        it receives the boolean status as its formatting argument.
    :return: True when the SELECT returned a value >= 0, False otherwise
        (including when a MySQLdb error occurred).
    """
    success = False
    log.info(pre_msg)

    try:
        con, cur = connection_func()
        try:
            with con:
                outcome = cur.execute(
                    "SELECT COUNT(*) FROM `modw`.`resourcefact`;")
                success = bool(outcome >= 0)
                report = log.info if success else log.error
                report(post_msg, success)
        except MySQLdb.Error as err:
            log.error('Unable to select from `modw`.`resourcefact`. %s: %s',
                      err.args[0], err.args[1])
    except MySQLdb.Error as err:
        log.error('Unable to connect to Database. %s: %s', err.args[0],
                  err.args[1])

    return success
from akrr.util.sql import db_check_priv from akrr.util.sql import get_db_client_host # Python version if sys.version_info.major < 3 or sys.version_info.minor < 4: log.critical("Python should be of version 3.4+. This one is " + sys.version) exit(1) # check openssl presence try: subprocess.check_output("which openssl", shell=True) except Exception as _e: log.error("""openssl program is not available. Install it! For example by running on CentOS sudo yum install openssl openssh-clients on Ubuntu: sudo apt-get install openssl""") raise _e # AKRR configuration can be in three places # 1) AKRR_CONF if AKRR_CONF environment variable is defined # 2) ~/akrr/etc/akrr.conf if initiated from RPM or global python install # 3) <path to AKRR sources>/etc/akrr.conf for in source installation in_src_install = False akrr_mod_dir = os.path.dirname( os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))) akrr_bin_dir = None if os.path.isfile(os.path.join(os.path.dirname(akrr_mod_dir), 'bin', 'akrr')):
def get_remote_access_method():
    """Interactively determine how to reach the resource head node over ssh.

    Steps, in order:
      1. ask for the head node name and ping it (a failed ping is tolerated
         when ``no_ping`` is set);
      2. ask for the user name and test the access with
         ``check_connection_to_resource()``;
      3. while the access fails, offer a menu: try again, use an existing
         key pair from ``~/.ssh``, generate a new key pair, or use a
         password;
      4. once the access works, open the connection with ``cfg.sshAccess``
         and keep it in the module-level ``rsh``.

    The answers are stored in the module-level variables declared ``global``
    below.

    :return: the value of the last ``check_connection_to_resource()`` call
        made by the main loop (the loop only ends when it is truthy).
    """
    global resource_name
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPassword4thisSession
    global sshPrivateKeyFile
    global sshPrivateKeyPassword
    global rsh
    global no_ping

    # set remoteAccessNode
    while True:
        log.log_input(
            "Enter Resource head node (access node) full name (e.g. headnode.somewhere.org):"
        )
        remoteAccessNode = input("[%s] " % resource_name)
        # empty answer: fall back to the resource name
        if remoteAccessNode.strip() == "":
            remoteAccessNode = resource_name

        # NOTE(review): the user's answer is pasted into a shell command
        # line unquoted; consider subprocess with an argument list.
        response = os.system("ping -c 1 -w2 " + remoteAccessNode +
                             " > /dev/null 2>&1")

        if response == 0:
            break
        else:
            if no_ping:
                log.warning("Can not ping %s, but asked to ignore it.",
                            remoteAccessNode)
                break
            log.error("Incorrect head node name (can not ping %s), try again",
                      remoteAccessNode)

    # set sshUserName
    current_user = getpass.getuser()
    # toggled by the menu actions below; when False the user name question
    # is skipped on the next pass of the loop
    ask_for_user_name = True

    while True:
        if ask_for_user_name:
            log.log_input("Enter username for resource access:")
            sshUserName = input("[%s] " % current_user)
            if sshUserName.strip() == "":
                sshUserName = current_user
            # the entered name becomes the default for the next question
            current_user = sshUserName

        # check password-less access
        if sshPassword is None:
            log.info("Checking for password-less access")
        else:
            log.info("Checking for resource access")
        successfully_connected = check_connection_to_resource()

        if successfully_connected:
            if sshPassword is None:
                log.info("Can access resource without password")
            else:
                log.info("Can access resource")

        if successfully_connected is False:
            log.info("Can not access resource without password")
            # menu entries are (action id, description) pairs
            action_list = [(
                "TryAgain",
                "The private and public keys was generated manually, right now. Try again."
            )]
            # check private keys
            # a private key is a file in ~/.ssh that has a matching
            # "<name>.pub" file next to it
            # NOTE(review): os.listdir raises when ~/.ssh does not exist.
            user_home_dir = os.path.expanduser("~")
            private_keys = [
                os.path.join(user_home_dir, '.ssh', f[:-4])
                for f in os.listdir(os.path.join(user_home_dir, '.ssh'))
                if os.path.isfile(os.path.join(user_home_dir, '.ssh', f))
                and f[-4:] == '.pub'
                and os.path.isfile(os.path.join(user_home_dir, '.ssh', f[:-4]))
            ]

            if len(private_keys) > 0:
                action_list.append(("UseExistingPrivateKey",
                                    "Use existing private and public key."))

            action_list.append(
                ("GenNewKey", "Generate new private and public key."))
            action_list.append(("UsePassword", "Use password directly."))
            log.empty_line()

            log.info("Select authentication method:" + "\n".join([
                "%3d  %s" % (i, desc)
                for i, (_, desc) in enumerate(action_list)
            ]))
            while True:
                log.log_input("Select option from list above:")
                try:
                    # empty answer selects entry 2; which action that is
                    # depends on whether existing keys were found above
                    action = input("[2] ")
                    if action.strip() == "":
                        action = 2
                    else:
                        action = int(action)

                    if action < 0 or action >= len(action_list):
                        raise ValueError()
                    break
                except (ValueError, TypeError):
                    log.error("Incorrect entry, try again.")

            # do the action
            log.empty_line()
            if action_list[action][0] == "TryAgain":
                continue
            if action_list[action][0] == "UsePassword":
                log.log_input("Enter password for %s@%s:" %
                              (sshUserName, remoteAccessNode))
                sshPassword = getpass.getpass("")
                ask_for_user_name = not ask_for_user_name
                continue
            if action_list[action][0] == "UseExistingPrivateKey":
                log.info("Available private keys:" + "\n".join(
                    ["%3d  %s" % (i, p) for i, p in enumerate(private_keys)]))
                while True:
                    log.log_input("Select key number from list above:")
                    try:
                        i_key = input("")
                        i_key = int(i_key)

                        if i_key < 0 or i_key >= len(private_keys):
                            raise ValueError()
                        break
                    except (ValueError, TypeError):
                        log.error("Incorrect entry, try again.")
                sshPrivateKeyFile = private_keys[i_key]
                ask_for_user_name = not ask_for_user_name
                continue
            if action_list[action][0] == "GenNewKey":
                # up to three attempts to get a working session password
                count = 0
                while True:
                    log.log_input(
                        "Enter password for %s@%s (will be used only during this session):"
                        % (sshUserName, remoteAccessNode))
                    sshPassword4thisSession = getpass.getpass("")
                    sshPassword = sshPassword4thisSession

                    if check_connection_to_resource():
                        break
                    count += 1
                    if count >= 3:
                        break

                # the session password is not kept as the access password
                sshPassword = None
                # generate keys
                log.log_input("Enter private key name:")
                sshPrivateKeyFile = input("[id_rsa_%s]" % resource_name)
                if sshPrivateKeyFile.strip() == "":
                    sshPrivateKeyFile = "id_rsa_%s" % resource_name
                sshPrivateKeyFile = os.path.join(user_home_dir, '.ssh',
                                                 sshPrivateKeyFile)

                log.log_input(
                    "Enter passphrase for new key (leave empty for passwordless access):"
                )
                sshPrivateKeyPassword = getpass.getpass("")
                # NOTE(review): passphrase and file name are interpolated
                # into a shell command line; quoting characters in either
                # break the command and the passphrase is visible in the
                # process list.
                os.system("ssh-keygen -t rsa -N \"%s\" -f %s" %
                          (sshPrivateKeyPassword, sshPrivateKeyFile))
                if sshPrivateKeyPassword.strip() == "":
                    sshPrivateKeyPassword = None
                # copy keys
                cfg.sshAccess(remoteAccessNode,
                              ssh='ssh-copy-id',
                              username=sshUserName,
                              password=sshPassword4thisSession,
                              PrivateKeyFile=sshPrivateKeyFile,
                              PrivateKeyPassword=None,
                              logfile=sys.stdout,
                              command='')
                ask_for_user_name = not ask_for_user_name
                continue

        if successfully_connected:
            break
        else:
            log.error("Incorrect resource access credential")

    if successfully_connected:
        log.empty_line()
        log.info("Connecting to " + resource_name)

        # capture what the ssh layer prints; it is only logged (at debug
        # level) when the connection attempt raises
        str_io = io.StringIO()
        try:
            sys.stdout = sys.stderr = str_io
            rsh = cfg.sshAccess(remoteAccessNode,
                                ssh=remoteAccessMethod,
                                username=sshUserName,
                                password=sshPassword,
                                PrivateKeyFile=sshPrivateKeyFile,
                                PrivateKeyPassword=sshPrivateKeyPassword,
                                logfile=sys.stdout,
                                command=None)
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
        except Exception as e:
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            log.debug(str_io.getvalue())
            raise e

        log.info(" Done")
    log.empty_line()
    return successfully_connected
def resource_add(config):
    """Interactively add a new resource and generate its configuration.

    The user picks the matching XDMoD resource (if any), a resource name and
    the queuing system; unless ``minimalistic`` is set, the remote access
    method, system characteristics and file system locations are asked too.
    Finally ``generate_resource_config`` is called.

    :param config: object with the following members:
        dry_run - Dry Run No files will actually be created
        minimalistic - Minimize questions number, configuration files will
            be edited manually
        no_ping - do not run ping to test headnode name
        verbose
    """
    global verbose
    global dry_run
    global no_ping
    global minimalistic
    global resource_name
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPrivateKeyFile
    global sshPrivateKeyPassword
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir
    global batchScheduler
    global batchJobHeaderTemplate

    log.info("Beginning Initiation of New Resource...")
    verbose = config.verbose
    dry_run = config.dry_run
    resource_deploy.dry_run = config.dry_run
    no_ping = config.no_ping
    minimalistic = config.minimalistic

    log.info("Retrieving Resources from XDMoD Database...")
    # RETRIEVE: the resources from XDMoD
    resources = retrieve_resources()
    log.info("Found following resources from XDMoD Database:\n" +
             " resource_id name\n" + "\n".join([
                 " %11d %-40s" % (xd_id, xd_name)
                 for xd_name, xd_id in resources
             ]) + "\n")

    if len(resources) > 0:
        while True:
            log.log_input(
                'Enter resource_id for import (enter 0 for no match):')
            resource_id = input()
            if validate_resource_id(resource_id, resources):
                break
            log.warning("Incorrect resource_id try again")
        log.empty_line()
        resource_id = int(resource_id)
    else:
        resource_id = 0

    if resource_id <= 0:  # i.e. no match from XDMoD DB
        resource_id = None

    resource_name = ""
    while True:
        if resource_id is None:
            log.log_input('Enter AKRR resource name:')
            resource_name = input()
        else:
            resource_name2 = get_resource_name_by_id(resource_id, resources)
            log.log_input(
                'Enter AKRR resource name, hit enter to use same name as in XDMoD Database [%s]:'
                % (resource_name2, ))
            resource_name = input()
            if resource_name.strip() == "":
                resource_name = resource_name2

        if validate_resource_name(resource_name):
            break
    log.empty_line()

    while True:
        log.log_input('Enter queuing system on resource (slurm or pbs): ')
        queuing_system = input()
        if validate_queuing_system(queuing_system):
            break
        else:
            log.error("Incorrect queuing_system try again")
    batchScheduler = queuing_system
    log.empty_line()

    if minimalistic is False:
        get_remote_access_method()
        get_system_characteristics()
        get_file_system_access_points()

    def _masked(secret):
        """Hide a secret in log output while still showing whether it is set."""
        return None if secret is None else "********"

    # One item per line; secrets are never written to the log in clear text.
    summary = [
        "Summary of parameters",
        "resource_name: {}".format(resource_name),
        "remoteAccessNode: {}".format(remoteAccessNode),
        "remoteAccessMethod: {}".format(remoteAccessMethod),
        "remoteCopyMethod: {}".format(remoteCopyMethod),
        "sshUserName: {}".format(sshUserName),
        "sshPassword: {}".format(_masked(sshPassword)),
        "sshPrivateKeyFile: {}".format(sshPrivateKeyFile),
        "sshPrivateKeyPassword: {}".format(_masked(sshPrivateKeyPassword)),
        "networkScratch: {}".format(networkScratch),
        "localScratch: {}".format(localScratch),
        "akrrData: {}".format(akrrData),
        "appKerDir: {}".format(appKerDir),
        "batchScheduler: {}".format(batchScheduler),
        "batchJobHeaderTemplate: {}".format(batchJobHeaderTemplate),
    ]
    log.debug("\n".join(summary) + "\n")

    generate_resource_config(resource_id, resource_name, queuing_system)
    log.info("Initiation of new resource is completed.\n"
             " Edit batchJobHeaderTemplate variable in {}\n"
             " and move to resource validation and deployment step.\n"
             " i.e. execute:\n"
             " akrr resource deploy -r {}".format(resource_cfg_filename,
                                                  resource_name))
def get_file_system_access_points():
    """Ask for the directories AKRR will use on the resource.

    Four locations are requested in turn and stored in the module-level
    ``localScratch``, ``networkScratch``, ``appKerDir`` and ``akrrData``.
    Each answer is finally passed through a remote ``echo`` and the stripped
    output is what gets stored.

    * local scratch: checked once, a failed check is only a warning;
    * network scratch: an empty answer is asked again, a failed check is
      only a warning;
    * app kernel and working directories: asked again until the check
      passes.
    """
    global resource_name
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir

    home_dir = cfg.sshCommand(rsh, "echo $HOME").strip()
    scratch_network_dir = cfg.sshCommand(rsh, "echo $SCRATCH").strip()

    # localScratch -- asked exactly once, whatever the check says
    local_scratch_default = "/tmp"
    log.log_input(
        "Enter location of local scratch (visible only to single node):")
    localScratch = input("[%s]" % local_scratch_default)
    if not localScratch.strip():
        localScratch = local_scratch_default
    status, msg = resource_deploy.check_dir_simple(rsh, localScratch)
    if status:
        log.info(msg)
    else:
        log.warning(msg)
        log.warning(
            'local scratch might be have a different location on head node, so if it is by design it is ok'
        )
    log.empty_line()
    localScratch = cfg.sshCommand(rsh, "echo %s" % (localScratch, )).strip()

    # networkScratch -- default comes from $SCRATCH on the resource, if set
    network_scratch_default = scratch_network_dir
    network_scratch_visible = False
    while True:
        log.log_input(
            "Enter location of network scratch (visible only to all nodes),"
            "used for temporary storage of app kernel input/output:")
        if network_scratch_default == "":
            networkScratch = input("")
        else:
            networkScratch = input("[%s]" % network_scratch_default)
            if not networkScratch.strip():
                networkScratch = network_scratch_default

        if networkScratch == "":
            log.error("Incorrect value for networkScratch, try again")
            continue

        status, msg = resource_deploy.check_dir(rsh,
                                                networkScratch,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if status:
            log.info(msg)
            network_scratch_visible = True
            log.empty_line()
        else:
            log.warning(msg)
        break
    networkScratch = cfg.sshCommand(rsh,
                                    "echo %s" % (networkScratch, )).strip()

    # appKerDir
    appker_dir_default = os.path.join(home_dir, "appker", resource_name)
    while True:
        log.log_input(
            "Enter future location of app kernels input and executable files:")
        appKerDir = input("[%s]" % appker_dir_default)
        if not appKerDir.strip():
            appKerDir = appker_dir_default
        status, msg = resource_deploy.check_dir(rsh,
                                                appKerDir,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if not status:
            log.error(msg)
            continue
        log.info(msg)
        log.empty_line()
        break
    appKerDir = cfg.sshCommand(rsh, "echo %s" % (appKerDir, )).strip()

    # akrrData -- defaults to the network scratch when that one checked out
    if network_scratch_visible:
        akrr_data_default = os.path.join(networkScratch, "akrr_data",
                                         resource_name)
    else:
        akrr_data_default = os.path.join(home_dir, "akrr_data", resource_name)
    while True:
        log.log_input(
            "Enter future locations for app kernels working directories (can or even should be on scratch space):"
        )
        akrrData = input("[%s]" % akrr_data_default)
        if not akrrData.strip():
            akrrData = akrr_data_default
        status, msg = resource_deploy.check_dir(rsh,
                                                akrrData,
                                                exit_on_fail=False,
                                                try_to_create=True)
        if not status:
            log.error(msg)
            continue
        log.info(msg)
        log.empty_line()
        break
    akrrData = cfg.sshCommand(rsh, "echo %s" % (akrrData, )).strip()
def resource_deploy(args):
    """Validate a resource, deploy AKRR files to it and run a test job.

    Steps: load and validate the resource configuration, connect, check the
    shell and directories, copy sources and inputs, check the app.signature
    calculator, run a test job and, when no errors were counted, update
    bashrc and enable the resource. A summary of error and warning counts is
    logged at the end.

    :param args: parsed arguments; ``resource`` is required, while
        ``dry_run``, ``checking_frequency``, ``appkernel`` and ``nodes`` are
        used when present.
    """
    global dry_run
    global checking_frequency

    resource_name = args.resource

    if 'dry_run' in args:
        dry_run = args.dry_run
    if "checking_frequency" in args:
        checking_frequency = args.checking_frequency
    app_name = args.appkernel if "appkernel" in args else "test"
    nodes = int(args.nodes) if "nodes" in args else 2

    # start counting from scratch
    log.error_count = 0
    log.warning_count = 0

    # validate resource configuration and get config
    resource = validate_resource_parameter_file(resource_name)

    # connect to resource
    rsh = connect_to_resource(resource)

    # do tests
    check_shell(rsh, resource)
    check_create_dirs(rsh, resource)

    # deploy inputs and sources
    copy_exec_sources_and_inputs(rsh, resource)

    # check that app.signature calculator on headnode
    check_appsig(rsh, resource)

    # close connection we don't need it any more
    rsh.close(force=True)
    del rsh

    # run test job to queue
    run_test_job(resource, app_name, nodes)

    if log.error_count == 0:
        append_to_bashrc(resource)
        enable_resource_for_execution(resource)

    log.empty_line()
    log.info("Result:")

    if log.error_count > 0:
        log.error("There are %d errors, fix them.", log.error_count)
    if log.warning_count > 0:
        log.warning(
            "There are %d warnings.\nif warnings have sense you can move to next step!\n",
            log.warning_count)
    if not (log.error_count != 0 or log.warning_count != 0):
        log.info("\nDONE, you can move to next step!\n")
def validate_resource_parameter_file(resource_name):
    """Validate a resource parameter file and return the resource configuration.

    The default resource configuration and the resource's own ``resource.conf``
    are executed into a scratch namespace to catch syntax errors, then the
    resource is loaded with ``cfg.FindResourceByName`` and the presence and
    type of its parameters are checked.

    :param resource_name: name of the resource; its configuration is looked up
        in ``<cfg.cfg_dir>/resources/<resource_name>/resource.conf``.
    :return: the resource configuration returned by ``cfg.FindResourceByName``.
    :raises SystemExit: (via ``exit(1)``) if the file does not exist, or a
        parameter is missing, is ``None`` where that is not allowed, or has the
        wrong type.
    :raises Exception: whatever executing the configuration files raised, after
        a critical message has been logged.
    """
    default_resource_param_filename = os.path.join(
        cfg.akrr_mod_dir, "default_conf", "default.resource.conf")
    resource_param_filename = os.path.join(
        cfg.cfg_dir, "resources", resource_name, "resource.conf")

    log.info("Validating %s parameters from %s", resource_name,
             resource_param_filename)

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) does not exist!",
                  resource_param_filename)
        exit(1)

    # check syntax: run defaults first, then the resource file on top of them
    try:
        namespace = {}
        for conf_filename in (default_resource_param_filename,
                              resource_param_filename):
            # context manager so the file handle is closed right after reading
            with open(conf_filename) as conf_file:
                conf_source = conf_file.read()
            exec(compile(conf_source, conf_filename, 'exec'), namespace)
    except Exception:
        log.critical(
            "Can not load resource from %s.\nProbably invalid syntax.",
            resource_param_filename)
        raise

    # now we can load akrr
    resource = cfg.FindResourceByName(resource_name)

    # check parameters for presence and type
    # format: key, type, can be None, must have parameter
    parameters_types = (
        ('info', str, False, False),
        ('localScratch', str, False, True),
        ('batchJobTemplate', str, False, True),
        ('remoteAccessNode', str, False, True),
        ('name', str, False, False),
        ('akrrCommonCommandsTemplate', str, False, True),
        ('networkScratch', str, False, True),
        ('ppn', int, False, True),
        # 'akrrStartAppKerTemplate' is intentionally not checked
        ('remoteCopyMethod', str, False, True),
        ('sshUserName', str, False, True),
        ('sshPassword', str, True, False),
        ('sshPrivateKeyFile', str, True, False),
        ('sshPrivateKeyPassword', str, True, False),
        ('batchScheduler', str, False, True),
        ('remoteAccessMethod', str, False, True),
        ('appKerDir', str, False, True),
        ('akrrCommonCleanupTemplate', str, False, True),
        # 'nodeListSetterTemplate' is intentionally not checked
        ('akrrData', str, False, True),
    )

    for variable, m_type, can_be_none, must in parameters_types:
        if variable not in resource:
            if must is True:
                log.error("Syntax error in %s\nVariable %s is not set",
                          resource_param_filename, variable)
                exit(1)
            # optional parameter which is absent: nothing to check
            continue

        value = resource[variable]
        if value is None:
            if can_be_none is False:
                log.error("Syntax error in %s\nVariable %s can not be None",
                          resource_param_filename, variable)
                exit(1)
            # None is explicitly allowed for this parameter
            continue

        if not isinstance(value, m_type):
            log.error(
                "Syntax error in %s\nVariable %s should be %s, but it is %s !",
                resource_param_filename, variable, str(m_type), type(value))
            exit(1)

    log.info(
        "Syntax of %s is correct and all necessary parameters are present.",
        resource_param_filename)
    log.empty_line()
    return resource
def _decode_gzip_base64(encoded):
    """Return the text stored in a base64-encoded gzip payload, '' if undecodable."""
    import base64
    import binascii
    import gzip
    import zlib

    if not encoded:
        return ''
    try:
        raw = gzip.decompress(base64.b64decode(encoded))
    except (binascii.Error, ValueError, OSError, EOFError, zlib.error):
        # an empty result is reported by the caller as a missing parameter
        return ''
    return raw.decode(errors='replace')


def _stripped_child_text(element, tag):
    """Return the stripped text of child ``tag`` of ``element`` or None if absent."""
    text = element.findtext(tag)
    return None if text is None else text.strip()


def analyse_test_job_results(task_id, resource, app_name="test"):
    """Analyse the output of a completed test job and log every problem found.

    The task record is fetched from the AKRR REST API, the XML report stored in
    its ``akrr_xdmod_instanceinfo`` entry is parsed, and the reported
    statistics and parameters are checked.  Each failed check is logged and
    counted in ``log.error_count``.  Finally the nodes the job ran on are
    pinged from the head node and the number of processes is compared with
    ``nnodes * resource['ppn']``.

    :param task_id: integer id of the task to analyse.
    :param resource: resource configuration; ``name`` and ``ppn`` are read
        here and the whole mapping is passed to ``cfg.sshResource``.
    :param app_name: application kernel name, used for the lock file name and
        the results summary.
    :raises SystemExit: (via ``exit(1)``) if the task can not be retrieved or
        its execution was not successful; in the latter case the test job lock
        file is removed first.
    :raises Exception: re-raised after logging if contacting the resource over
        ssh fails.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    r = akrrrestclient.get('/tasks/%d' % task_id)

    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)

    # parse the response body once instead of once per field
    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instance_info = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data['akrr_errmsg']

    results_summary = make_results_summary(resource['name'], app_name,
                                           completed_tasks,
                                           akrr_xdmod_instance_info,
                                           akrr_errmsg)

    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue"
        ) > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(
        akrr_xdmod_instance_info['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')

    # defaults mean "not reported" and are treated as failures below
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }

    for elm in elm_parameters:
        variable = _stripped_child_text(elm, 'ID')
        value = _stripped_child_text(elm, 'value')
        units = _stripped_child_text(elm, 'units')

        if variable in ('App:ExeBinSignature', 'RunEnv:Nodes'):
            # these values arrive gzip-compressed and base64-encoded; decode
            # them in-process rather than interpolating remote data into a
            # shell pipeline
            value = _decode_gzip_base64(value)

        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value

    for elm in elm_statistics:
        variable = _stripped_child_text(elm, 'ID')
        value = _stripped_child_text(elm, 'value')
        units = _stripped_child_text(elm, 'units')

        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))

    files_exists = [
        'Network scratch directory exists',
        'App kernel input exists',
        'local scratch directory exists',
        'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]

    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1

    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1

    for dir_access in dirs_access:
        if statistics[dir_access] == '0':
            log.error(dir_access.replace('accessible', 'is not accessible'))
            log.error_count += 1

    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it. "
            "see application output for more hints")
        log.error_count += 1

    # test the nodes, log to headnode and ping them
    if parameters['RunEnv:Nodes'] == '':
        log.error(
            "Nodes are not detected, check batchJobTemplate and setup of AKRR_NODELIST variable"
        )
        log.error_count += 1

    nodes = parameters['RunEnv:Nodes'].split()

    # NOTE(review): eval() on a string returned by the AKRR server; if
    # resource_param is always a plain literal, ast.literal_eval would be the
    # safer choice - confirm the format before changing.
    requested_nodes = eval(completed_tasks['resource_param'])['nnodes']

    # capture everything the ssh helpers print while talking to the resource
    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        rsh = cfg.sshResource(resource)

        number_of_unknown_hosts = 0
        for node in set(nodes):
            log.debug2(node)
            out = cfg.sshCommand(rsh, "ping -c 1 %s" % node)
            if out.count("unknown host") > 0:
                number_of_unknown_hosts += 1

        rsh.close(force=True)
        del rsh

        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__

        if number_of_unknown_hosts > 0:
            log.error(
                "ERROR %d: Can not ping compute nodes from head node\n" %
                (log.error_count + 1) +
                "Nodes on which test job was executed detected as " +
                parameters['RunEnv:Nodes'] + "\n" +
                "If these names does not have sense check batchJobTemplate and setup of AKRR_NODELIST "
                "variable in resource configuration file")
            log.error_count += 1
    except Exception:
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise

    # check ppn count
    if requested_nodes * resource['ppn'] != len(nodes):
        log.error(
            "ERROR {}: Number of requested processes (processes per node * nodes) "
            "do not match actual processes executed\n"
            "Either\n"
            "    AKRR_NODELIST variable is set incorrectly\n"
            "Or\n"
            "    processes per node (PPN) is wrong\n".format(log.error_count + 1))
        log.error_count += 1

    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
def check_connection_to_resource():
    """Check the connection to the remote resource, prompting for credentials.

    Repeatedly tries ``cfg.sshAccess`` with command ``ls`` using the
    module-level connection settings.  When an attempt fails, the captured
    output is inspected:

    * if it contains a passphrase prompt, the user is asked for the key
      passphrase (the loop gives up once three passphrases were entered and
      the prompt still appears, resetting ``sshPrivateKeyFile`` and
      ``sshPrivateKeyPassword`` to ``None``);
    * if it contains a password prompt while a private key file is set, the
      user is asked for a password and ``ssh-copy-id`` is run to authorize the
      key (up to three password attempts), after which the connection is
      retried;
    * otherwise the loop stops.

    Returns True if an ``ls`` attempt completed without raising, else False.
    """
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPassword4thisSession
    global sshPrivateKeyFile
    global sshPrivateKeyPassword

    successfully_connected = False
    passphrase_entrance_count = 0  # passphrases entered so far
    authorize_key_count = 0  # passwords entered for ssh-copy-id so far
    while True:
        # Try to connect
        # fresh buffer per attempt; stdout/stderr are redirected into it
        str_io = io.StringIO()
        try:
            sys.stdout = sys.stderr = str_io
            cfg.sshAccess(remoteAccessNode,
                          ssh=remoteAccessMethod,
                          username=sshUserName,
                          password=sshPassword,
                          PrivateKeyFile=sshPrivateKeyFile,
                          PrivateKeyPassword=sshPrivateKeyPassword,
                          logfile=str_io,
                          command='ls')

            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__

            successfully_connected = True
            break
        except Exception:
            # restore the real streams before logging anything
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            response = str_io.getvalue()

            log.debug(
                "Had attempted to access resource without password and failed, below is resource response"
                + "=" * 80 + str_io.getvalue() + "=" * 80)

            # check if it asking for passphrase
            m = re.search(r"Enter passphrase for key '(.*)':", response)
            if m:
                if passphrase_entrance_count >= 3:
                    # too many wrong passphrases: forget the key and give up
                    sshPrivateKeyPassword = None
                    sshPrivateKeyFile = None
                    break
                if passphrase_entrance_count > 0:
                    log.error("Incorrect passphrase try again")
                # key file name is taken from the prompt itself
                sshPrivateKeyFile = m.group(1)
                log.log_input("Enter passphrase for key '%s':" %
                              sshPrivateKeyFile)
                sshPrivateKeyPassword = getpass.getpass("")
                passphrase_entrance_count += 1
                continue
            m2 = re.search(r"[pP]assword:", response)
            if m is None and sshPrivateKeyFile is not None and m2:
                # password prompt although a key is configured
                log.warning(
                    "Can not login to head node. "
                    "Probably the public key of private key was not authorized on head node"
                )
                log.info(
                    "Will try to add public key to list of authorized keys on head node"
                )
                while True:
                    try:
                        authorize_key_count += 1
                        log.log_input(
                            "Enter password for %s@%s (will be used only during this session):"
                            % (sshUserName, remoteAccessNode))
                        sshPassword4thisSession = getpass.getpass("")
                        log.empty_line()
                        str_io = io.StringIO()
                        sys.stdout = sys.stderr = str_io
                        cfg.sshAccess(remoteAccessNode,
                                      ssh='ssh-copy-id',
                                      username=sshUserName,
                                      password=sshPassword4thisSession,
                                      PrivateKeyFile=sshPrivateKeyFile,
                                      PrivateKeyPassword=None,
                                      logfile=str_io,
                                      command='')

                        sys.stdout = sys.__stdout__
                        sys.stderr = sys.__stderr__
                        # NOTE(review): `response` still holds the output of
                        # the failed login attempt above; the ssh-copy-id
                        # output is in the new str_io - confirm which one was
                        # meant to be logged.
                        log.info(response)

                        log.info(
                            "Have added public key to list of authorized keys on head node, "
                            "will attempt to connect again.")
                        log.empty_line()
                        break
                    except Exception:
                        sys.stdout = sys.__stdout__
                        sys.stderr = sys.__stderr__
                        # NOTE(review): `verbose` is not defined in this
                        # function; presumably a module-level flag - verify.
                        if verbose:
                            log.debug(
                                "Had attempted to add public key to list of authorized keys on head node and failed, "
                                + "below is resource response" + "=" * 80 +
                                str_io.getvalue() + "=" * 80)
                        log.error("Incorrect password try again.")
                        if authorize_key_count >= 3:
                            break
                # retry the plain connection unless the password attempts
                # were exhausted
                if authorize_key_count < 3:
                    continue
            # neither prompt could be handled: stop trying
            break
    return successfully_connected
def monitor_test_job(task_id):
    """Monitor the progress of a test job and wait until it is done.

    Polls ``/tasks/<task_id>`` on the AKRR REST API every
    ``checking_frequency`` seconds, printing the task status whenever it
    changes and a timestamp on every cycle.  After each poll the server is
    asked to re-check the active task right away.

    :param task_id: id of the task to monitor.
    :return: tuple ``(completed_tasks, akrr_xdmod_instance_info, akrr_errmsg)``
        taken from the task record once it is in the ``completed_tasks``
        queue; all three are ``None`` if monitoring stopped because the REST
        API kept returning a non-200 status.
    :raises Exception: re-raised from the update request once more than ten
        bad cycles have accumulated.
    """
    completed_tasks = None
    akrr_xdmod_instance_info = None
    akrr_errmsg = None

    msg_body_prev = ""

    bad_cycles = 0
    while True:
        t = datetime.datetime.now()

        r = akrrrestclient.get('/tasks/' + str(task_id))

        if r.status_code == 200:
            # parse the body only for successful responses, and only once
            response_json = r.json()
            queue = response_json["data"]["queue"]

            msg_body = "Test status:\n"

            if queue == "scheduled_tasks":
                msg_body += "Task is in scheduled_tasks queue.\n"
                msg_body += "It schedule to be started on " + str(
                    response_json["data"]["data"]['time_to_start']) + "\n"
            elif queue == "active_tasks":
                task = response_json["data"]["data"]
                msg_body += "Task is in active_tasks queue.\n"
                msg_body += "Status: " + str(task['status']) + "\n"
                msg_body += "Status info:\n" + str(task['statusinfo']) + "\n"
            elif queue == "completed_tasks":
                task = response_json["data"]["data"]
                msg_body += "Task is completed!\n"
                completed_tasks = task['completed_tasks']
                akrr_xdmod_instance_info = task['akrr_xdmod_instanceinfo']
                akrr_errmsg = task['akrr_errmsg']
                if log.verbose:
                    msg_body += "completed_tasks table entry:\n" + pp.pformat(
                        completed_tasks) + "\n"
                    msg_body += "akrr_xdmod_instanceinfo table entry:\n" + pp.pformat(
                        akrr_xdmod_instance_info) + "\n"
                    msg_body += 'output parsing results:\n' + akrr_xdmod_instance_info[
                        'body'] + "\n"
                else:
                    msg_body += "\tstatus: " + str(
                        akrr_xdmod_instance_info['status']) + "\n"
                    if akrr_xdmod_instance_info['status'] == 0:
                        msg_body += "\tstatus2: " + completed_tasks[
                            'status'] + "\n"
                        msg_body += "\tstatusinfo: " + completed_tasks[
                            'statusinfo'] + "\n"
            else:
                # unknown queue: show the raw response
                msg_body += r.text + "\n"

            tail_msg = "time: " + t.strftime("%Y-%m-%d %H:%M:%S")

            if msg_body != msg_body_prev:
                print("\n\n" + msg_body)
                print(tail_msg, end=' ')
            else:
                # status unchanged: only refresh the timestamp in place
                print("\r" + tail_msg, end=' ')
            sys.stdout.flush()

            msg_body_prev = msg_body

            if queue == "completed_tasks":
                break
        else:
            bad_cycles += 1
            if bad_cycles > 3:
                # an error response is not guaranteed to carry JSON
                # (assumes r.json() raises a ValueError subclass, as
                # requests does - TODO confirm for akrrrestclient)
                try:
                    details = r.json()
                except ValueError:
                    details = r.text
                log.error("Something wrong, REST API said: %s", details)
                break

        # try to update:
        try:
            payload = {'next_check_time': ''}
            akrrrestclient.put('/active_tasks/' + str(task_id), data=payload)
        except Exception:
            bad_cycles += 1
            if bad_cycles > 10:
                log.error("Something wrong with REST API")
                raise

        time.sleep(checking_frequency)

    return completed_tasks, akrr_xdmod_instance_info, akrr_errmsg