def enable_resource_for_execution(resource):
    """Register the resource in mod_appkernel and switch it on for job execution.

    Does nothing in dry-run mode.  Otherwise:

    1. makes sure ``mod_appkernel.resource`` has a row whose nickname is
       ``resource['name']`` (a disabled, invisible row is inserted when none
       exists) and then marks that row enabled and visible;
    2. asks the AKRR REST API to turn the resource on.

    ``MySQLdb.Error`` in step 1 and ``requests.RequestException`` in step 2
    are logged as errors and not re-raised.
    """
    if dry_run:
        return

    resource_name = resource['name']
    select_by_nickname = '''SELECT * FROM resource WHERE nickname=%s'''

    try:
        con_ak, cur_ak = cfg.getAKDB(True)

        cur_ak.execute(select_by_nickname, (resource_name, ))
        rows = cur_ak.fetchall()
        if not rows:
            log.warning(
                "There is no record of %s in mod_appkernel.resource will add one.",
                resource_name)
            cur_ak.execute(
                '''INSERT INTO resource (resource,nickname,description,enabled,visible) VALUES(%s,%s,%s,0,0);''',
                (resource_name, resource_name, resource['info']))
            con_ak.commit()

            # read the freshly inserted row back to learn its resource_id
            cur_ak.execute(select_by_nickname, (resource_name, ))
            rows = cur_ak.fetchall()
        ak_record = rows[0]

        # enable and make visible
        cur_ak.execute(
            '''UPDATE resource SET enabled=1,visible=1 WHERE resource_id=%s;''',
            (ak_record['resource_id'], ))
        con_ak.commit()
        log.info(
            "Enabled %s in mod_appkernel.resource for tasks execution and made it visible to XDMoD UI.",
            resource_name)
    except MySQLdb.Error:
        log.error("Can not connect to AK DB\n"
                  "Probably invalid credential")

    # enabling resource for execution
    try:
        reply = akrrrestclient.put('/resources/' + resource_name + '/on')
        if reply.status_code != 200:
            log.error(
                "Can not enable resource through AKRR REST API ( %s )\nSee server response below\n%s",
                akrrrestclient.restapi_host,
                json.dumps(reply.json(), indent=4))
        else:
            log.info('Successfully enabled ' + resource_name)
    except requests.RequestException:
        log.error(
            "Can not enable resource through AKRR REST API ( %s )\n"
            "Is it still running?\n", akrrrestclient.restapi_host)
def check_if_test_job_already_submitted(resource, app_name="test"):
    """Return the task id of an already submitted test job, or None.

    The task id is read from the first line of the test-job lock file
    (located with ``get_test_job_lock_filename``).  The id is returned only
    when the AKRR REST API answers ``/tasks/<task_id>`` with status 200;
    in that case a warning explaining that the existing task will be
    monitored is logged and ``log.warning_count`` is incremented.

    ``None`` is returned when the lock file does not exist, when its first
    line is not an integer, or when the REST API does not know the task.
    """
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    if not os.path.isfile(test_job_lock_filename):
        return None

    # ``with`` closes the lock file even if reading fails
    with open(test_job_lock_filename, "r") as fin:
        first_line = fin.readline()

    try:
        task_id = int(first_line)
    except ValueError:
        # empty or corrupted lock file: there is no task to monitor
        log.warning(
            "Can not read task id from %s, will treat test job as not submitted",
            test_job_lock_filename)
        return None

    r = akrrrestclient.get('/tasks/' + str(task_id))
    if r.status_code != 200:
        return None

    log.warning_count += 1
    log.warning(
        "\nWARNING %d: Seems this is rerun of this script, will monitor task with task_id = %d ",
        log.warning_count, task_id)
    log.warning("To submit new task delete %s\n", test_job_lock_filename)
    # TODO: check how old is it
    return task_id
def check_create_dirs(rsh, resource):
    """Check the directories named in the resource configuration.

    ``akrrData`` and ``appKerDir`` are checked with ``exit_on_fail=True`` and
    ``try_to_create=True``.  ``networkScratch`` and ``localScratch`` are only
    probed (no creation, no exit); a failed probe is reported as a counted
    warning because scratch may legitimately be located elsewhere on the
    head node.
    """
    log.info("Checking directory locations\n")

    # directories that have to exist (they are created when missing)
    for key in ('akrrData', 'appKerDir'):
        location = resource[key]
        log.info("Checking: %s:%s", resource['remoteAccessNode'], location)
        _, report = check_dir(rsh, location, exit_on_fail=True, try_to_create=True)
        log.info(report)

    # scratch directories: absence on the head node is only worth a warning
    scratch_hints = (
        ('networkScratch',
         "WARNING %d: network scratch might be have a different location on head node, "
         "so if it is by design it is ok"),
        ('localScratch',
         "WARNING %d: local scratch might be have a different location on head node, "
         "so if it is by design it is ok"),
    )
    for key, hint in scratch_hints:
        location = resource[key]
        log.info("Checking: %s:%s", resource['remoteAccessNode'], location)
        found, report = check_dir(rsh, location, exit_on_fail=False, try_to_create=False)
        if found is True:
            log.info(report)
            continue
        log.warning_count += 1
        log.warning(report)
        log.warning(hint, log.warning_count)

    log.empty_line()
def run_it(_):
    """Placeholder handler: only warns that the command is not implemented."""
    message = "add_command_install is not implemented"
    log.warning(message)
def run_it(_):
    """Placeholder handler: only warns that the command is not implemented."""
    message = "add_command_build is not implemented"
    log.warning(message)
def resource_deploy(args):
    """Validate a resource configuration, deploy AKRR files to it and run a test job.

    ``args`` must provide ``resource`` and may provide ``dry_run``,
    ``checking_frequency``, ``appkernel`` (default ``"test"``) and ``nodes``
    (default 2).  ``dry_run`` and ``checking_frequency`` update the
    module-level settings of the same name.

    Steps: validate the resource parameter file, connect to the resource,
    check the shell and the directories, copy inputs/sources, check the
    app.signature calculator, close the connection, run the test job.  When
    no errors were counted, the remote bashrc is updated and the resource is
    enabled for execution.  A summary of counted errors and warnings is
    logged at the end.
    """
    global dry_run
    global checking_frequency

    resource_name = args.resource

    if 'dry_run' in args:
        dry_run = args.dry_run
    if "checking_frequency" in args:
        checking_frequency = args.checking_frequency
    app_name = args.appkernel if "appkernel" in args else "test"
    nodes = int(args.nodes) if "nodes" in args else 2

    log.error_count = 0
    log.warning_count = 0

    # validate resource configuration and get config
    resource = validate_resource_parameter_file(resource_name)

    # connect to resource
    rsh = connect_to_resource(resource)
    try:
        # do tests
        check_shell(rsh, resource)
        check_create_dirs(rsh, resource)

        # deploy inputs and sources
        copy_exec_sources_and_inputs(rsh, resource)

        # check that app.signature calculator on headnode
        check_appsig(rsh, resource)
    finally:
        # close connection we don't need it any more; done in ``finally`` so
        # the remote shell is released even when a step above raises or exits
        rsh.close(force=True)
        del rsh

    # run test job to queue
    run_test_job(resource, app_name, nodes)

    if log.error_count == 0:
        append_to_bashrc(resource)
        enable_resource_for_execution(resource)

    log.empty_line()

    log.info("Result:")
    if log.error_count > 0:
        log.error("There are %d errors, fix them.", log.error_count)

    if log.warning_count > 0:
        log.warning(
            "There are %d warnings.\nif warnings have sense you can move to next step!\n",
            log.warning_count)
    if log.error_count == 0 and log.warning_count == 0:
        log.info("\nDONE, you can move to next step!\n")
def copy_exec_sources_and_inputs(rsh, resource):
    """Deploy the inputs and execs tarballs into the resource's appKerDir.

    For each of ``inputs`` and ``execs``: when the directory is not listed in
    ``resource['appKerDir']`` the matching tarball from
    ``cfg.appker_repo_dir`` is copied over and unpacked (both skipped in
    dry-run mode); when it is already there a counted warning is logged and
    the content is assumed to be correct.  Finally the tarballs are removed
    on the remote side.

    Any exception is logged as critical and re-raised.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        " app. kernel input files and \n"
        " HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    missing_marker = "No such file or directory"

    try:
        app_ker_dir = resource['appKerDir']

        cfg.sshCommand(rsh, "cd %s" % app_ker_dir)
        listing = cfg.sshCommand(rsh, "ls " + app_ker_dir)
        present = listing.strip().split()

        # ---- app. kernel inputs ----
        if "inputs" in present or "inputs/" in present:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, app_ker_dir)
        else:
            log.info("Copying app. kernel input tarball to %s", app_ker_dir)
            if not dry_run:
                cfg.scpToResource(
                    resource, cfg.appker_repo_dir + "/inputs.tar.gz", app_ker_dir)

            log.info("Unpacking app. kernel input files to %s/inputs", app_ker_dir)
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/inputs.tar.gz" % app_ker_dir)
                log.debug(out)

                out = cfg.sshCommand(rsh, "du -h %s/inputs" % app_ker_dir)
                log.debug(out)

                if missing_marker in out:
                    raise Exception("files are not copied!")
                log.info("App. kernel input files are in %s/inputs\n", app_ker_dir)

        # ---- sources and app.signature calculator ----
        if "execs" in present or "execs/" in present:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, app_ker_dir)
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n"
            )
        else:
            log.info(
                "Copying app. kernel execs tarball to %s\n" % (app_ker_dir) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator"
            )
            if not dry_run:
                cfg.scpToResource(
                    resource, cfg.appker_repo_dir + "/execs.tar.gz", app_ker_dir)

            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                app_ker_dir)
            if not dry_run:
                out = cfg.sshCommand(
                    rsh, "tar xvfz %s/execs.tar.gz" % app_ker_dir)
                log.debug(out)

                out = cfg.sshCommand(rsh, "df -h %s/execs" % app_ker_dir)
                log.debug(out)

                if missing_marker in out:
                    raise Exception("files are not copied!")
                log.info(
                    "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                    app_ker_dir)

        # tarballs are not needed once unpacked
        cfg.sshCommand(rsh, "rm execs.tar.gz inputs.tar.gz")
    except Exception:
        log.critical("Can not copy files to %s", resource['name'])
        raise
def _remove_log_dir():
    """Remove the directory configured as ``cfg.akrr_log_dir``.

    Logs a warning and does nothing when ``cfg.akrr_log_dir`` is None.
    """
    log_dir = cfg.akrr_log_dir
    if log_dir is not None:
        _remove_dir(log_dir)
        return
    log.warning("akrr_log_dir is None")
def resource_add(config):
    """Interactively add a new resource to AKRR.

    ``config`` should have following members:
        dry_run - Dry Run No files will actually be created
        minimalistic - Minimize questions number, configuration files will be edited manually
        no_ping - do not run ping to test headnode name
        verbose

    The operator is asked for the XDMoD resource to import (if any), the AKRR
    resource name and the queuing system; unless ``minimalistic`` is set the
    remote access method, system characteristics and file system access
    points are collected as well.  The answers are stored in module-level
    variables and the resource configuration is generated from them.
    """
    global verbose
    global dry_run
    global no_ping
    global minimalistic
    global resource_name
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPrivateKeyFile
    global sshPrivateKeyPassword
    global networkScratch
    global localScratch
    global akrrData
    global appKerDir
    global batchScheduler
    global batchJobHeaderTemplate

    def _hidden(secret):
        # credentials must never be written to the log, only whether they are set
        return "<hidden>" if secret else secret

    log.info("Beginning Initiation of New Resource...")
    verbose = config.verbose
    dry_run = config.dry_run
    resource_deploy.dry_run = config.dry_run
    no_ping = config.no_ping
    minimalistic = config.minimalistic

    log.info("Retrieving Resources from XDMoD Database...")
    # RETRIEVE: the resources from XDMoD
    resources = retrieve_resources()
    log.info("Found following resources from XDMoD Database:\n" +
             " resource_id name\n" + "\n".join([
                 " %11d %-40s" % (xdmod_id, xdmod_name)
                 for xdmod_name, xdmod_id in resources
             ]) + "\n")

    if len(resources) > 0:
        while True:
            log.log_input(
                'Enter resource_id for import (enter 0 for no match):')
            resource_id = input()
            if validate_resource_id(resource_id, resources):
                break
            log.warning("Incorrect resource_id try again")
        log.empty_line()
        resource_id = int(resource_id)
    else:
        resource_id = 0

    if resource_id <= 0:  # i.e. no match from XDMoD DB
        resource_id = None

    resource_name = ""
    while True:
        if resource_id is None:
            log.log_input('Enter AKRR resource name:')
            resource_name = input()
        else:
            resource_name2 = get_resource_name_by_id(resource_id, resources)
            log.log_input(
                'Enter AKRR resource name, hit enter to use same name as in XDMoD Database [%s]:'
                % (resource_name2, ))
            resource_name = input()
            if resource_name.strip() == "":
                resource_name = resource_name2

        if validate_resource_name(resource_name):
            break
    log.empty_line()

    while True:
        log.log_input('Enter queuing system on resource (slurm or pbs): ')
        queuing_system = input()
        if validate_queuing_system(queuing_system):
            break
        else:
            log.error("Incorrect queuing_system try again")
    batchScheduler = queuing_system
    log.empty_line()

    if minimalistic is False:
        get_remote_access_method()
        get_system_characteristics()
        get_file_system_access_points()

    # One parameter per line; passwords and passphrases are masked.
    log.debug("\n".join([
        "Summary of parameters",
        "resource_name: {}".format(resource_name),
        "remoteAccessNode: {}".format(remoteAccessNode),
        "remoteAccessMethod: {}".format(remoteAccessMethod),
        "remoteCopyMethod: {}".format(remoteCopyMethod),
        "sshUserName: {}".format(sshUserName),
        "sshPassword: {}".format(_hidden(sshPassword)),
        "sshPrivateKeyFile: {}".format(sshPrivateKeyFile),
        "sshPrivateKeyPassword: {}".format(_hidden(sshPrivateKeyPassword)),
        "networkScratch: {}".format(networkScratch),
        "localScratch: {}".format(localScratch),
        "akrrData: {}".format(akrrData),
        "appKerDir: {}".format(appKerDir),
        "batchScheduler: {}".format(batchScheduler),
        "batchJobHeaderTemplate: {}".format(batchJobHeaderTemplate),
    ]) + "\n")

    generate_resource_config(resource_id, resource_name, queuing_system)
    log.info("Initiation of new resource is completed.\n"
             " Edit batchJobHeaderTemplate variable in {}\n"
             " and move to resource validation and deployment step.\n"
             " i.e. execute:\n"
             " akrr resource deploy -r {}".format(
                 resource_cfg_filename, resource_name))
def get_file_system_access_points():
    """Ask the operator for the scratch, app-kernel and working directories.

    Sets the module-level ``localScratch``, ``networkScratch``, ``appKerDir``
    and ``akrrData``.  Defaults are derived from ``$HOME`` and ``$SCRATCH``
    as reported by the remote shell.  Every accepted value is finally passed
    through ``echo`` on the remote side, so the stored value is what the
    remote shell expands it to.

    Local and network scratch are accepted even when the check fails (only a
    warning is logged); ``appKerDir`` and ``akrrData`` are asked for again
    until the directory check succeeds.
    """
    global resource_name, networkScratch, localScratch, akrrData, appKerDir

    home_dir = cfg.sshCommand(rsh, "echo $HOME").strip()
    scratch_network_dir = cfg.sshCommand(rsh, "echo $SCRATCH").strip()

    # ---- localScratch: asked once, a failed check is only a warning ----
    default_local = "/tmp"
    log.log_input(
        "Enter location of local scratch (visible only to single node):")
    localScratch = input("[%s]" % default_local)
    if localScratch.strip() == "":
        localScratch = default_local
    ok, report = resource_deploy.check_dir_simple(rsh, localScratch)
    if ok:
        log.info(report)
    else:
        log.warning(report)
        log.warning(
            'local scratch might be have a different location on head node, so if it is by design it is ok'
        )
    log.empty_line()
    localScratch = cfg.sshCommand(rsh, "echo %s" % (localScratch, )).strip()

    # ---- networkScratch: repeated only while the answer is empty ----
    default_network = scratch_network_dir  # empty when $SCRATCH is not set
    network_scratch_visible = False
    while True:
        log.log_input(
            "Enter location of network scratch (visible only to all nodes),"
            "used for temporary storage of app kernel input/output:")
        if default_network != "":
            networkScratch = input("[%s]" % default_network)
            if networkScratch.strip() == "":
                networkScratch = default_network
        else:
            networkScratch = input("")

        if networkScratch == "":
            log.error("Incorrect value for networkScratch, try again")
            continue

        ok, report = resource_deploy.check_dir(
            rsh, networkScratch, exit_on_fail=False, try_to_create=True)
        if ok:
            log.info(report)
            network_scratch_visible = True
            log.empty_line()
        else:
            log.warning(report)
        break
    networkScratch = cfg.sshCommand(rsh, "echo %s" % (networkScratch, )).strip()

    # ---- appKerDir: repeated until the directory check succeeds ----
    default_appker = os.path.join(home_dir, "appker", resource_name)
    while True:
        log.log_input(
            "Enter future location of app kernels input and executable files:")
        appKerDir = input("[%s]" % default_appker)
        if appKerDir.strip() == "":
            appKerDir = default_appker
        ok, report = resource_deploy.check_dir(
            rsh, appKerDir, exit_on_fail=False, try_to_create=True)
        if not ok:
            log.error(report)
            continue
        log.info(report)
        log.empty_line()
        break
    appKerDir = cfg.sshCommand(rsh, "echo %s" % (appKerDir, )).strip()

    # ---- akrrData: prefers network scratch when that one was usable ----
    data_base_dir = networkScratch if network_scratch_visible else home_dir
    default_data = os.path.join(data_base_dir, "akrr_data", resource_name)
    while True:
        log.log_input(
            "Enter future locations for app kernels working directories (can or even should be on scratch space):"
        )
        akrrData = input("[%s]" % default_data)
        if akrrData.strip() == "":
            akrrData = default_data
        ok, report = resource_deploy.check_dir(
            rsh, akrrData, exit_on_fail=False, try_to_create=True)
        if not ok:
            log.error(report)
            continue
        log.info(report)
        log.empty_line()
        break
    akrrData = cfg.sshCommand(rsh, "echo %s" % (akrrData, )).strip()
def get_remote_access_method():
    """Interactively determine how to log in to the resource and open a shell.

    Asks for the head node name (verified with a single ``ping`` unless
    ``no_ping`` is set) and the user name, then tries to connect with
    ``check_connection_to_resource``.  While the connection fails the
    operator picks one of: try again, use an existing key pair from
    ``~/.ssh``, generate a new key pair (and copy it with ``ssh-copy-id``),
    or use a password.  The choices are stored in the module-level access
    variables; on success the connected shell is stored in the module-level
    ``rsh``.

    Returns the result of the last connection check.  Exceptions raised
    while opening the final shell are re-raised after the captured output
    was logged at debug level.
    """
    global resource_name
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPassword4thisSession
    global sshPrivateKeyFile
    global sshPrivateKeyPassword
    global rsh
    global no_ping

    # local import: external programs are started without a shell so that
    # operator input (host name, passphrase, key name) is never re-parsed
    import subprocess

    # set remoteAccessNode
    while True:
        log.log_input(
            "Enter Resource head node (access node) full name (e.g. headnode.somewhere.org):"
        )
        remoteAccessNode = input("[%s] " % resource_name)
        if remoteAccessNode.strip() == "":
            remoteAccessNode = resource_name

        try:
            response = subprocess.call(
                ["ping", "-c", "1", "-w2", remoteAccessNode.strip()],
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except OSError:
            # ping itself can not be started: same as an unreachable host
            response = 1

        if response == 0:
            break
        if no_ping:
            log.warning("Can not ping %s, but asked to ignore it.",
                        remoteAccessNode)
            break
        log.error("Incorrect head node name (can not ping %s), try again",
                  remoteAccessNode)

    # set sshUserName
    current_user = getpass.getuser()
    ask_for_user_name = True

    while True:
        if ask_for_user_name:
            log.log_input("Enter username for resource access:")
            sshUserName = input("[%s] " % current_user)
            if sshUserName.strip() == "":
                sshUserName = current_user
            current_user = sshUserName

        # check password-less access
        if sshPassword is None:
            log.info("Checking for password-less access")
        else:
            log.info("Checking for resource access")
        successfully_connected = check_connection_to_resource()

        if successfully_connected:
            if sshPassword is None:
                log.info("Can access resource without password")
            else:
                log.info("Can access resource")

        if successfully_connected is False:
            log.info("Can not access resource without password")
            action_list = [(
                "TryAgain",
                "The private and public keys was generated manually, right now. Try again."
            )]
            # check private keys: every <name>.pub with a matching <name>
            user_home_dir = os.path.expanduser("~")
            ssh_dir = os.path.join(user_home_dir, '.ssh')
            private_keys = [
                os.path.join(ssh_dir, f[:-4])
                for f in os.listdir(ssh_dir)
                if os.path.isfile(os.path.join(ssh_dir, f))
                and f[-4:] == '.pub'
                and os.path.isfile(os.path.join(ssh_dir, f[:-4]))
            ]

            if len(private_keys) > 0:
                action_list.append(("UseExistingPrivateKey",
                                    "Use existing private and public key."))

            action_list.append(
                ("GenNewKey", "Generate new private and public key."))
            action_list.append(("UsePassword", "Use password directly."))
            log.empty_line()

            # header on its own line, then one option per line
            log.info("Select authentication method:\n" + "\n".join([
                "%3d %s" % (i, desc)
                for i, (_, desc) in enumerate(action_list)
            ]))
            while True:
                log.log_input("Select option from list above:")
                try:
                    action = input("[2] ")
                    if action.strip() == "":
                        action = 2
                    else:
                        action = int(action)

                    if action < 0 or action >= len(action_list):
                        raise ValueError()
                    break
                except (ValueError, TypeError):
                    log.error("Incorrect entry, try again.")

            # do the action
            log.empty_line()
            chosen = action_list[action][0]
            if chosen == "TryAgain":
                continue
            if chosen == "UsePassword":
                log.log_input("Enter password for %s@%s:" %
                              (sshUserName, remoteAccessNode))
                sshPassword = getpass.getpass("")
                ask_for_user_name = not ask_for_user_name
                continue
            if chosen == "UseExistingPrivateKey":
                log.info("Available private keys:\n" + "\n".join(
                    ["%3d %s" % (i, p) for i, p in enumerate(private_keys)]))
                while True:
                    log.log_input("Select key number from list above:")
                    try:
                        i_key = input("")
                        i_key = int(i_key)
                        if i_key < 0 or i_key >= len(private_keys):
                            raise ValueError()
                        break
                    except (ValueError, TypeError):
                        log.error("Incorrect entry, try again.")
                sshPrivateKeyFile = private_keys[i_key]
                ask_for_user_name = not ask_for_user_name
                continue
            if chosen == "GenNewKey":
                # the password is needed once, to install the new public key
                count = 0
                while True:
                    log.log_input(
                        "Enter password for %s@%s (will be used only during this session):"
                        % (sshUserName, remoteAccessNode))
                    sshPassword4thisSession = getpass.getpass("")
                    sshPassword = sshPassword4thisSession

                    if check_connection_to_resource():
                        break
                    count += 1
                    if count >= 3:
                        break

                sshPassword = None
                # generate keys
                log.log_input("Enter private key name:")
                sshPrivateKeyFile = input("[id_rsa_%s]" % resource_name)
                if sshPrivateKeyFile.strip() == "":
                    sshPrivateKeyFile = "id_rsa_%s" % resource_name
                sshPrivateKeyFile = os.path.join(user_home_dir, '.ssh',
                                                 sshPrivateKeyFile)

                log.log_input(
                    "Enter passphrase for new key (leave empty for passwordless access):"
                )
                sshPrivateKeyPassword = getpass.getpass("")

                # argument list, no shell: the passphrase and the file name
                # reach ssh-keygen exactly as typed
                try:
                    subprocess.call([
                        "ssh-keygen", "-t", "rsa",
                        "-N", sshPrivateKeyPassword,
                        "-f", sshPrivateKeyFile,
                    ])
                except OSError as e:
                    log.error("Can not run ssh-keygen: %s", e)
                if sshPrivateKeyPassword.strip() == "":
                    sshPrivateKeyPassword = None
                # copy keys
                cfg.sshAccess(remoteAccessNode,
                              ssh='ssh-copy-id',
                              username=sshUserName,
                              password=sshPassword4thisSession,
                              PrivateKeyFile=sshPrivateKeyFile,
                              PrivateKeyPassword=None,
                              logfile=sys.stdout,
                              command='')
                ask_for_user_name = not ask_for_user_name
                continue

        if successfully_connected:
            break
        else:
            log.error("Incorrect resource access credential")

    if successfully_connected:
        log.empty_line()
        log.info("Connecting to " + resource_name)

        str_io = io.StringIO()
        try:
            try:
                # capture everything the connection attempt prints
                sys.stdout = sys.stderr = str_io
                rsh = cfg.sshAccess(remoteAccessNode,
                                    ssh=remoteAccessMethod,
                                    username=sshUserName,
                                    password=sshPassword,
                                    PrivateKeyFile=sshPrivateKeyFile,
                                    PrivateKeyPassword=sshPrivateKeyPassword,
                                    logfile=sys.stdout,
                                    command=None)
            finally:
                # always give the real streams back, whatever was raised
                sys.stdout = sys.__stdout__
                sys.stderr = sys.__stderr__
        except Exception:
            log.debug(str_io.getvalue())
            raise

        log.info(" Done")
    log.empty_line()
    return successfully_connected
def check_connection_to_resource():
    """Check the connection to the remote resource.

    Runs ``ls`` on the resource through ``cfg.sshAccess`` using the
    module-level access settings.  While the attempt runs, ``sys.stdout`` and
    ``sys.stderr`` are redirected into an in-memory buffer; the captured text
    is inspected when the attempt raises:

    * if it contains an ``Enter passphrase for key '...'`` prompt, the
      operator is asked for the passphrase and the attempt is repeated;
      after three passphrase prompts the key file and passphrase are reset
      to None and the check gives up;
    * otherwise, if a private key file is set and the text contains a
      password prompt, the operator's password is used to run
      ``ssh-copy-id`` (at most three tries) and the attempt is repeated;
    * otherwise the check gives up.

    May modify the module-level ``sshPrivateKeyFile``,
    ``sshPrivateKeyPassword`` and ``sshPassword4thisSession``.

    Returns True if one ``cfg.sshAccess`` attempt completed without raising,
    False otherwise.
    """
    global remoteAccessNode
    global remoteAccessMethod
    global remoteCopyMethod
    global sshUserName
    global sshPassword
    global sshPassword4thisSession
    global sshPrivateKeyFile
    global sshPrivateKeyPassword
    successfully_connected = False
    passphrase_entrance_count = 0
    authorize_key_count = 0
    while True:
        # Try to connect
        str_io = io.StringIO()
        try:
            # capture everything printed during the attempt
            sys.stdout = sys.stderr = str_io
            cfg.sshAccess(remoteAccessNode, ssh=remoteAccessMethod, username=sshUserName, password=sshPassword, PrivateKeyFile=sshPrivateKeyFile, PrivateKeyPassword=sshPrivateKeyPassword, logfile=str_io, command='ls')
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            successfully_connected = True
            break
        except Exception:
            # restore the real streams before logging anything
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            response = str_io.getvalue()
            log.debug( "Had attempted to access resource without password and failed, below is resource response" + "=" * 80 + str_io.getvalue() + "=" * 80)
            # check if it asking for passphrase
            m = re.search(r"Enter passphrase for key '(.*)':", response)
            if m:
                if passphrase_entrance_count >= 3:
                    # too many passphrase prompts: forget the key and give up
                    sshPrivateKeyPassword = None
                    sshPrivateKeyFile = None
                    break
                if passphrase_entrance_count > 0:
                    log.error("Incorrect passphrase try again")
                # the key file name is taken from the prompt text itself
                sshPrivateKeyFile = m.group(1)
                log.log_input("Enter passphrase for key '%s':" % sshPrivateKeyFile)
                sshPrivateKeyPassword = getpass.getpass("")
                passphrase_entrance_count += 1
                continue
            m2 = re.search(r"[pP]assword:", response)
            # a key is configured but a password prompt was printed
            if m is None and sshPrivateKeyFile is not None and m2:
                log.warning( "Can not login to head node. " "Probably the public key of private key was not authorized on head node" )
                log.info( "Will try to add public key to list of authorized keys on head node" )
                while True:
                    try:
                        authorize_key_count += 1
                        log.log_input( "Enter password for %s@%s (will be used only during this session):" % (sshUserName, remoteAccessNode))
                        sshPassword4thisSession = getpass.getpass("")
                        log.empty_line()
                        str_io = io.StringIO()
                        sys.stdout = sys.stderr = str_io
                        cfg.sshAccess(remoteAccessNode, ssh='ssh-copy-id', username=sshUserName, password=sshPassword4thisSession, PrivateKeyFile=sshPrivateKeyFile, PrivateKeyPassword=None, logfile=str_io, command='')
                        sys.stdout = sys.__stdout__
                        sys.stderr = sys.__stderr__
                        # NOTE(review): ``response`` still holds the text of the
                        # failed login attempt, not the ssh-copy-id output
                        # captured in the new ``str_io`` - confirm this is intended
                        log.info(response)
                        log.info( "Have added public key to list of authorized keys on head node, " "will attempt to connect again.")
                        log.empty_line()
                        break
                    except Exception:
                        sys.stdout = sys.__stdout__
                        sys.stderr = sys.__stderr__
                        if verbose:
                            log.debug( "Had attempted to add public key to list of authorized keys on head node and failed, " + "below is resource response" + "=" * 80 + str_io.getvalue() + "=" * 80)
                        log.error("Incorrect password try again.")
                        if authorize_key_count >= 3:
                            break
                # NOTE(review): when the third ssh-copy-id try succeeds the
                # counter is already 3, so the connection is not retried and
                # False is returned - verify whether that is intended
                if authorize_key_count < 3:
                    continue
            break
    return successfully_connected
def install_cron_scripts(self):
    """Install cron scripts.

    Makes sure the current user's crontab contains the AKRR daemon
    ``restart`` and ``checknrestart`` entries and a ``MAILTO`` line that
    matches ``self.cron_email``:

    * an existing, uncommented ``MAILTO`` line with a different address is
      replaced, or commented out when no e-mail is configured;
    * a ``MAILTO`` line is inserted at the top when an e-mail is configured
      and none exists;
    * missing daemon entries are appended.

    Nothing is written in dry-run mode or when all entries are already in
    place.  Failures to run ``crontab`` are logged as errors.
    """
    import tempfile

    log.info("Installing cron entries")
    if dry_run:
        return

    # an empty address is the same as "no address configured"
    configured_email = self.cron_email or None
    mail = ("MAILTO = " + configured_email) if configured_email else None
    restart = "50 23 * * * " + akrr_bin_dir + "/akrr daemon -cron restart"
    checknrestart = "33 * * * * " + akrr_bin_dir + "/akrr daemon -cron checknrestart"

    try:
        raw_crontab = subprocess.check_output(["crontab", "-l"])
    except (subprocess.CalledProcessError, OSError):
        log.info("Crontab does not have user's crontab yet")
        crontab_lines = []
    else:
        # surrogateescape lets undecodable bytes survive the round trip
        crontab_lines = raw_crontab.decode(
            "utf-8", errors="surrogateescape").splitlines(True)

    mail_updated = False
    mail_there = False
    restart_there = False
    checknrestart_there = False

    for i, line in enumerate(crontab_lines):
        stripped = line.strip()
        if len(stripped) <= 1 or stripped.startswith("#"):
            continue

        m = re.match(r'^MAILTO\s*=\s*(.*)', stripped)
        if m:
            mail_there = True
            # compare the address found in crontab (quotes removed) with
            # the configured one
            current_email = m.group(1).replace('"', '').strip()
            wanted_email = (configured_email or "").replace('"', '').strip()
            if configured_email is None or current_email != wanted_email:
                if mail:
                    # keep the line terminator, entries must not be fused
                    crontab_lines[i] = mail + "\n"
                else:
                    crontab_lines[i] = "#" + line
                mail_updated = True

        if "akrr" in line and "daemon" in line:
            if "checknrestart" in line:
                checknrestart_there = True
            # "checknrestart" contains "restart": do not count it twice
            if "restart" in line.replace("checknrestart", ""):
                restart_there = True

    if mail_updated:
        log.info("Cron's MAILTO was updated")

    mail_as_wanted = mail_there if mail is not None else not mail_there
    if mail_as_wanted and restart_there and checknrestart_there and not mail_updated:
        log.warning(
            "All AKRR crond entries found. No modifications necessary.")
        return

    # appended entries must start on a line of their own
    if crontab_lines and not crontab_lines[-1].endswith("\n"):
        crontab_lines[-1] += "\n"

    if mail is not None and not mail_there:
        crontab_lines.insert(0, mail + "\n")
    if not restart_there:
        crontab_lines.append(restart + "\n")
    if not checknrestart_there:
        crontab_lines.append(checknrestart + "\n")

    # a private temporary file instead of ".crontmp" in the current directory
    with tempfile.NamedTemporaryFile(
            "wb", prefix="akrr_crontab_", delete=False) as f:
        f.write("".join(crontab_lines).encode(
            "utf-8", errors="surrogateescape"))
        tmp_path = f.name
    try:
        try:
            return_code = subprocess.call(["crontab", tmp_path])
        except OSError as e:
            log.error("Can not run crontab: %s", e)
            return
    finally:
        os.remove(tmp_path)

    if return_code != 0:
        log.error(
            "crontab exited with code %d, cron entries were not installed",
            return_code)
        return
    log.info("Cron Scripts Processed!")