def verify_resource_params(resource: dict, warnings_as_exceptions: bool = False) -> dict:
    """
    Run a basic sanity check over the parameters of a resource configuration.

    Parameters listed in the module-level ``resource_renamed_parameters`` are
    copied to their new names (the old key is left in place).  Then every rule
    from the module-level ``resource_parameters_types`` and from a second,
    scheduler dependent rule set is applied.

    :param resource: resource parameters; modified in place
    :param warnings_as_exceptions: raise DeprecationWarning on a renamed
        parameter instead of only logging a warning
    :return: the same ``resource`` dict
    :raises NameError: a mandatory parameter is not set
    :raises TypeError: a parameter is None but must not be, or has a wrong type
    :raises DeprecationWarning: renamed parameter found and
        ``warnings_as_exceptions`` is set
    """
    global resource_renamed_parameters
    global resource_parameters_types

    # carry deprecated names over to their replacements
    for old_key, new_key in resource_renamed_parameters:
        if old_key not in resource:
            continue
        resource[new_key] = resource[old_key]
        message = "Resource parameter {} was renamed to {}".format(old_key, new_key)
        if warnings_as_exceptions:
            raise DeprecationWarning(message)
        log.warning(message)

    # @todo check string templates for deprecated variables

    def apply_rules(rules):
        """Check presence and type; rules format: key -> (type, can be None, must have)"""
        for variable, (m_type, nullable, must) in rules.items():
            if variable not in resource:
                if must is True:
                    raise NameError(
                        "Syntax error in " + resource['name'] + "\nVariable %s is not set" % (variable,))
                continue

            value = resource[variable]
            if value is None:
                if not nullable:
                    raise TypeError(
                        "Syntax error in " + resource['name'] + "\nVariable %s can not be None" % (variable,))
                # None is acceptable for nullable parameters, no type check needed
                continue

            if not isinstance(value, m_type):
                raise TypeError(
                    "Syntax error in " + resource['name'] +
                    "\nVariable %s should be %s" % (variable, str(m_type)) +
                    ". But it is " + str(type(value)))

    apply_rules(resource_parameters_types)

    # level 2 parameters: remote_access_node may be None only for openstack
    is_openstack = resource['batch_scheduler'].lower() == "openstack"
    apply_rules({'remote_access_node': [str, is_openstack, True]})

    # mapped parameters which still uses internally different name
    # these eventually should be renamed (nothing is mapped at the moment)
    resource_renamed_parameters_internal_name = []
    for old_key, new_key in resource_renamed_parameters_internal_name:
        if old_key in resource:
            resource[new_key] = resource[old_key]

    return resource
def verify_app_params(app: dict, app_on_resource: dict, warnings_as_exceptions: bool = False) -> dict:
    """
    Run a basic sanity check over application kernel parameters.

    Parameters listed in the module-level ``app_renamed_parameters`` are copied
    to their new names in both ``app`` and ``app_on_resource``; afterwards
    presence and type of a fixed set of parameters is checked on ``app``.

    :param app: combined parameters which are validated; modified in place
    :param app_on_resource: app-on-resource parameters; modified in place
    :param warnings_as_exceptions: raise DeprecationWarning on a renamed
        parameter instead of only logging a warning
    :return: ``app_on_resource``
    :raises NameError: a mandatory parameter is not set
    :raises TypeError: a parameter is None but must not be, or has a wrong type
    :raises DeprecationWarning: renamed parameter found and
        ``warnings_as_exceptions`` is set
    """
    global app_renamed_parameters

    def report_rename(old_key, new_key):
        if warnings_as_exceptions:
            raise DeprecationWarning("App parameter {} was renamed to {}".format(old_key, new_key))
        log.warning("App parameter %s was renamed to %s", old_key, new_key)

    # mapped renamed parameters
    for old_key, new_key in app_renamed_parameters:
        if old_key in app_on_resource:
            deprecated_value = app_on_resource[old_key]
            app[new_key] = deprecated_value
            app_on_resource[new_key] = deprecated_value
            report_rename(old_key, new_key)
        if old_key in app:
            deprecated_value = app[old_key]
            app[new_key] = deprecated_value
            app_on_resource[new_key] = deprecated_value
            report_rename(old_key, new_key)

    # check that parameters for presents and type
    # format: key,type,can be None,must have parameter
    parameters_types = (
        ('parser', str, False, True),
        ('executable', str, True, True),
        ('input_param', str, True, True),
        ('walltime_limit', int, False, True),
        ('run_script', dict, False, False),
    )

    for variable, m_type, nullable, must in parameters_types:
        if variable not in app:
            if must:
                raise NameError("Syntax error in " + app['name'] + "\nVariable %s is not set" % (variable,))
            continue

        value = app[variable]
        if value is None:
            if not nullable:
                raise TypeError("Syntax error in " + app['name'] + "\nVariable %s can not be None" % (variable,))
            # None is acceptable for nullable parameters, no type check needed
            continue

        if not isinstance(value, m_type):
            raise TypeError(
                "Syntax error in " + app['name'] +
                "\nVariable %s should be %s" % (variable, str(m_type)) +
                ". But it is " + str(type(value)))

    # mapped parameters which still uses internally different name
    # these eventually should be renamed (nothing is mapped at the moment)
    renamed_parameters_internal_name = []
    for old_key, new_key in renamed_parameters_internal_name:
        if old_key in app:
            app_on_resource[old_key] = app_on_resource[new_key]

    return app_on_resource
def enable_resource_for_execution(resource):
    """
    Register the resource in mod_appkernel and switch it on for job execution.

    Does nothing when ``akrr.dry_run`` is set.  First makes sure a row for the
    resource exists in the ``resource`` table of the AK database (inserting a
    disabled, invisible one if needed) and marks it enabled and visible; then
    asks the AKRR REST API to turn the resource on.  Database and REST
    failures are logged, not raised.

    :param resource: resource parameters, ``name`` is always read and ``info``
        is read when a new database record has to be inserted
    """
    if akrr.dry_run:
        return

    resource_name = resource['name']
    select_by_nickname = '''SELECT * FROM resource WHERE nickname=%s'''

    try:
        con_ak, cur_ak = akrr.db.get_ak_db(True)

        cur_ak.execute(select_by_nickname, (resource_name, ))
        rows = cur_ak.fetchall()
        if not rows:
            log.warning(
                "There is no record of %s in mod_appkernel.resource will add one.",
                resource_name)
            cur_ak.execute(
                '''INSERT INTO resource (resource,nickname,description,enabled,visible) VALUES(%s,%s,%s,0,0);''',
                (resource_name, resource_name, resource['info']))
            con_ak.commit()

            # read the freshly inserted record back to learn its resource_id
            cur_ak.execute(select_by_nickname, (resource_name, ))
            rows = cur_ak.fetchall()
        record = rows[0]

        # enable and make visible
        cur_ak.execute(
            '''UPDATE resource SET enabled=1,visible=1 WHERE resource_id=%s;''',
            (record['resource_id'], ))
        con_ak.commit()
        log.info(
            "Enabled %s in mod_appkernel.resource for tasks execution and made it visible to XDMoD UI.",
            resource_name)
    except MySQLdb.Error:
        log.error("Can not connect to AK DB\n"
                  "Probably invalid credential")

    # enabling resource for execution
    try:
        response = akrrrestclient.put('/resources/' + resource_name + '/on')
        if response.status_code != 200:
            log.error(
                "Can not enable resource through AKRR REST API ( %s )\nSee server response below\n%s",
                akrrrestclient.restapi_host, json.dumps(response.json(), indent=4))
        else:
            log.info('Successfully enabled ' + resource_name)
    except requests.RequestException:
        log.error(
            "Can not enable resource through AKRR REST API ( %s )\n"
            "Is it still running?\n", akrrrestclient.restapi_host)
def check_if_test_job_already_submitted(resource, app_name="test"):
    """
    Check if the test job is already submitted.

    The task id is read from the first line of the test job lock file (its
    name comes from ``get_test_job_lock_filename``) and confirmed with a
    ``/tasks/<task_id>`` request to the AKRR REST API.

    :param resource: resource parameters, passed to get_test_job_lock_filename
    :param app_name: application kernel name, passed to
        get_test_job_lock_filename
    :return: task id if the lock file exists, holds an integer and the REST
        API answers with status 200 for that task; None otherwise
    """
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    if not os.path.isfile(test_job_lock_filename):
        return None

    # context manager closes the file even if reading fails
    with open(test_job_lock_filename, "r") as fin:
        first_line = fin.readline()

    try:
        task_id = int(first_line)
    except ValueError:
        # empty or damaged lock file can not point to a task, treat it the
        # same way as a lock file pointing to an unknown task
        log.warning("Can not read task id from %s, will ignore it", test_job_lock_filename)
        return None

    r = akrrrestclient.get('/tasks/' + str(task_id))
    if r.status_code != 200:
        return None

    log.warning_count += 1
    log.warning(
        "\nWARNING %d: Seems this is rerun of this script, will monitor task with task_id = %d ",
        log.warning_count, task_id)
    log.warning("To submit new task delete %s\n", test_job_lock_filename)
    # check how old is it
    return task_id
def get_daemon_pid(akrr_pid_file, delete_pid_file_if_daemon_down=False):
    """
    Return the PID of AKRR server

    The PID is read from the first line of ``akrr_pid_file``.

    :param akrr_pid_file: path to the PID file
    :param delete_pid_file_if_daemon_down: if the PID from the file does not
        belong to any running process, remove the file and return None
        instead of raising
    :return: None if there is no PID file (or a stale one was removed),
        otherwise the PID stored in the file
    :raises IOError: PID file exists, no process with that PID is running and
        ``delete_pid_file_if_daemon_down`` is not set
    :raises ValueError, IndexError: PID file does not start with an integer
    """
    import os

    if not os.path.isfile(akrr_pid_file):
        return None

    # context manager closes the file even if the content can not be parsed
    with open(akrr_pid_file, "r") as fin:
        lines = fin.readlines()
    pid = int(lines[0])

    # psutil is needed only when there is a PID to look at
    import psutil

    # Check For the existence of a unix pid
    if psutil.pid_exists(pid):
        try:
            cmd = " ".join(psutil.Process(pid=pid).cmdline())
            if cmd.count('akrr') and cmd.count('daemon') and cmd.count('start'):
                return pid
        except Exception as e:
            log.log_traceback(str(e))
        # NOTE(review): the PID is returned below even if the command line of
        # that process does not look like AKRR daemon - confirm it is intended
    else:
        # if here means that previous session was crushed
        if delete_pid_file_if_daemon_down:
            log.warning(
                "WARNING:File %s exists meaning that the previous execution was finished incorrectly."
                "Removing pid file." % akrr_pid_file)
            os.remove(akrr_pid_file)
            return None
        raise IOError(
            "File %s exists meaning that the previous execution was finished incorrectly." % akrr_pid_file)

    return pid
def check_create_dirs(rsh, resource):
    """
    Check the directories of the resource with ``check_dir`` and log results.

    ``akrr_data`` and ``appkernel_dir`` are checked with exit_on_fail=True,
    ``network_scratch`` and ``local_scratch`` with exit_on_fail=False; for the
    latter two a failed check only raises ``log.warning_count`` and logs
    warnings.  Creation is attempted for all but ``local_scratch``.

    :param rsh: remote shell handle, passed to check_dir
    :param resource: resource parameters
    """
    log.info("Checking directory locations\n")

    # resource key, exit_on_fail, try_to_create, warning shown on failed check
    # (None means the message from check_dir is always logged as info)
    plan = (
        ('akrr_data', True, True, None),
        ('appkernel_dir', True, True, None),
        ('network_scratch', False, True,
         "WARNING %d: network scratch might be have a different location on head node, "
         "so if it is by design it is ok"),
        ('local_scratch', False, False,
         "WARNING %d: local scratch might be have a different location on head node, "
         "so if it is by design it is ok"),
    )

    for key, exit_on_fail, try_to_create, failure_note in plan:
        d = resource[key]
        log.info("Checking: %s:%s", resource['remote_access_node'], d)
        status, msg = check_dir(rsh, d, exit_on_fail=exit_on_fail, try_to_create=try_to_create)

        if failure_note is None or status is True:
            log.info(msg)
            continue

        log.warning_count += 1
        log.warning(msg)
        log.warning(failure_note, log.warning_count)

    log.empty_line()
def load_app_on_resource(app_name: str, resource_name: str, resource: Dict, app: Dict,
                         app_on_resource_cfg_filename: str = None, validate: bool = True) -> Dict:
    """
    Load configuration of application kernel on a particular resource.

    Layers, later ones are executed on top of earlier ones:
    a copy of ``app['appkernel_on_resource']['default']``, then
    ``<default_dir>/<app_name>.<execution_method>.app.conf`` if it exists,
    then the resource specific configuration file.

    :param app_name: application kernel name
    :param resource_name: resource name, used for default config location
    :param resource: resource parameters
    :param app: default application kernel parameters
    :param app_on_resource_cfg_filename: resource specific configuration file,
        by default ``<cfg_dir>/resources/<resource_name>/<app_name>.app.conf``
    :param validate: run verify_app_params on the combined parameters
    :return: dict with app parameters; empty dict if the resource specific
        file is missing and ``app['need_resource_specific_conf']`` is false
    :raises AkrrError: if configuration can not be loaded (any exception,
        including the missing-file error, is logged and reported this way)
    """
    log.debug("Loading app %s", app_name)
    from akrr.util import exec_files_to_dict

    try:
        # load resource specific parameters
        if app_on_resource_cfg_filename is None:
            app_on_resource_cfg_filename = os.path.join(
                cfg_dir, "resources", resource_name, app_name + ".app.conf")

        if not os.path.isfile(app_on_resource_cfg_filename):
            if not app['need_resource_specific_conf']:
                return {}
            # raise error because a specific app on resource was asked
            raise AkrrError(
                "application kernel configuration file do not exists (%s)!" % app_on_resource_cfg_filename)

        # init default
        app_on_resource = copy.deepcopy(app['appkernel_on_resource']['default'])
        app_on_resource.setdefault('name', app_name)
        app_on_resource.setdefault('nickname', app_name + ".@nnodes@")

        # execution_method from resource config can be overwritten by app on resource config
        execution_method = _get_app_execution_method(
            app_on_resource_cfg_filename, default=resource.get("execution_method", "hpc"))

        # read default config
        default_cfg = os.path.join(default_dir, "%s.%s.app.conf" % (app_name, execution_method))
        if os.path.isfile(default_cfg):
            app_on_resource = exec_files_to_dict(default_cfg, var_in=app_on_resource)
        elif execution_method != "hpc":
            log.warning("%s doen't have default for %s execution method" % (app_name, execution_method))

        # read resource specific configuration
        app_on_resource['resource_specific_app_cfg_filename'] = app_on_resource_cfg_filename
        app_on_resource['resource_specific_app_cfg_file_last_mod_time'] = 0
        if os.path.isfile(app_on_resource_cfg_filename):
            app_on_resource = exec_files_to_dict(app_on_resource_cfg_filename, var_in=app_on_resource)
            app_on_resource['resource_specific_app_cfg_file_last_mod_time'] = \
                os.path.getmtime(app_on_resource_cfg_filename)

        # validation combined config
        if validate:
            app_combined = {}
            for layer in (resource, app, app_on_resource):
                app_combined.update(layer)
            app_on_resource = verify_app_params(app_combined, app_on_resource)

        return app_on_resource
    except Exception:
        log.exception("Exception occurred during app kernel configuration loading for %s." % app_name)
        raise AkrrError("Can not load app configuration for %s." % app_name)
def resource_add(config):
    """
    Interactively add a resource and generate its configuration file.

    Asks which XDMoD resource to import (if any were found), the AKRR
    resource name and the queuing system; unless ``minimalistic`` is set also
    asks for remote access, system characteristics and file system access
    points.  Collected values are kept in module-level globals and finally
    passed on to ``generate_resource_config``.

    config should have following members
        dry_run - Dry Run No files will actually be created
        minimalistic - Minimize questions number, configuration files will be edited manually
        no-ping - do not run ping to test headnode name
        verbose
    """
    global verbose
    global no_ping
    global minimalistic
    global resource_name
    global remote_access_node
    global remote_access_method
    global remote_copy_method
    global ssh_username
    global ssh_password
    global ssh_private_key_file
    global ssh_private_key_password
    global network_scratch
    global local_scratch
    global akrr_data
    global appkernel_dir
    global batch_scheduler
    global batch_job_header_template

    if config.verbose:
        verbose = True

    log.info("Beginning Initiation of New Resource...")
    verbose = config.verbose
    akrr.dry_run = config.dry_run
    no_ping = config.no_ping
    minimalistic = config.minimalistic

    log.info("Retrieving Resources from XDMoD Database...")
    # RETRIEVE: the resources from XDMoD
    resources = retrieve_resources_from_xdmod()

    log.info(
        "Found following resources from XDMoD Database:\n" +
        " resource_id name\n" +
        "\n".join(" %11d %-40s" % (xdmod_id, xdmod_name) for xdmod_name, xdmod_id in resources) +
        "\n")

    if len(resources) > 0:
        while True:
            log.log_input('Enter resource_id for import (enter 0 for no match):')
            resource_id = input()
            if validate_resource_id(resource_id, resources):
                break
            log.warning("Incorrect resource_id try again")
        log.empty_line()
        resource_id = int(resource_id)
    else:
        resource_id = 0

    if resource_id <= 0:  # i.e. no match from XDMoD DB
        resource_id = None

    resource_name = ""
    while True:
        if resource_id is None:
            log.log_input('Enter AKRR resource name:')
            resource_name = input()
        else:
            resource_name2 = get_resource_name_by_id(resource_id, resources)
            log.log_input(
                'Enter AKRR resource name, hit enter to use same name as in XDMoD Database [%s]:' %
                (resource_name2, ))
            resource_name = input()
            if resource_name.strip() == "":
                resource_name = resource_name2

        if validate_resource_name(resource_name):
            break
    log.empty_line()

    while True:
        log.log_input('Enter queuing system on resource (slurm, pbs or openstack): ')
        queuing_system = input()
        if validate_queuing_system(queuing_system):
            break
        log.error("Incorrect queuing_system try again")
    batch_scheduler = queuing_system
    log.empty_line()

    if minimalistic is False:
        get_remote_access_method()
        get_system_characteristics()
        get_file_system_access_points()

    def hide_secret(value):
        """Passwords must not get into log files, show only whether they are set"""
        return "<hidden>" if value else value

    # one parameter per line, otherwise the summary is not readable
    summary = (
        ("resource_name", resource_name),
        ("remote_access_node", remote_access_node),
        ("remote_access_method", remote_access_method),
        ("remote_copy_method", remote_copy_method),
        ("ssh_username", ssh_username),
        ("ssh_password", hide_secret(ssh_password)),
        ("ssh_private_key_file", ssh_private_key_file),
        ("ssh_private_key_password", hide_secret(ssh_private_key_password)),
        ("network_scratch", network_scratch),
        ("local_scratch", local_scratch),
        ("akrr_data", akrr_data),
        ("appkernel_dir", appkernel_dir),
        ("batch_scheduler", batch_scheduler),
        ("batch_job_header_template", batch_job_header_template),
    )
    log.debug(
        "Summary of parameters\n" +
        "\n".join("    {}: {}".format(key, value) for key, value in summary) +
        "\n")

    generate_resource_config(resource_id, resource_name, queuing_system)
    log.info("Initiation of new resource is completed.\n"
             " Edit batch_job_header_template variable in {}\n"
             " and move to resource validation and deployment step.\n"
             " i.e. execute:\n"
             " akrr resource deploy -r {}".format(resource_cfg_filename, resource_name))
def get_file_system_access_points():
    """
    Ask the user for directories on the resource and store them in globals.

    Sets ``local_scratch``, ``network_scratch``, ``appkernel_dir`` and
    ``akrr_data``.  Every answer is checked on the remote side and finally
    passed through ``echo`` there, the stripped output of which is what gets
    stored.  Local and network scratch are asked once and accepted even if
    the check fails; the other two are asked until ``check_dir`` succeeds.
    """
    global resource_name
    global network_scratch
    global local_scratch
    global akrr_data
    global appkernel_dir

    def on_remote(command):
        """Stripped output of the command executed through rsh"""
        return akrr.util.ssh.ssh_command(rsh, command).strip()

    def ask_for_creatable_dir(prompt, default):
        """Ask until check_dir succeeds on the answer (empty answer means default)"""
        while True:
            log.log_input(prompt)
            answer = input("[%s]" % default)
            if answer.strip() == "":
                answer = default
            status, msg = check_dir(rsh, answer, exit_on_fail=False, try_to_create=True)
            if not status:
                log.error(msg)
                continue
            log.info(msg)
            log.empty_line()
            return answer

    home_dir = on_remote("echo $HOME")
    scratch_network_dir = on_remote("echo $SCRATCH")

    # local_scratch, accepted whatever the outcome of the check is
    local_scratch_default = "/tmp"
    log.log_input("Enter location of local scratch (visible only to single node):")
    local_scratch = input("[%s]" % local_scratch_default)
    if local_scratch.strip() == "":
        local_scratch = local_scratch_default
    status, msg = check_dir_simple(rsh, local_scratch)
    if status:
        log.info(msg)
    else:
        log.warning(msg)
        log.warning(
            'local scratch might be have a different location on head node, so if it is by design it is ok')
    log.empty_line()
    local_scratch = on_remote("echo %s" % (local_scratch, ))

    # network_scratch, $SCRATCH on the resource is the default if it is set
    network_scratch_default = scratch_network_dir
    network_scratch_visible = False
    while True:
        log.log_input(
            "Enter location of network scratch (visible only to all nodes),"
            "used for temporary storage of app kernel input/output:")
        if network_scratch_default == "":
            network_scratch = input("")
        else:
            network_scratch = input("[%s]" % network_scratch_default)
            if network_scratch.strip() == "":
                network_scratch = network_scratch_default

        if network_scratch == "":
            log.error("Incorrect value for network_scratch, try again")
            continue

        status, msg = check_dir(rsh, network_scratch, exit_on_fail=False, try_to_create=True)
        if status:
            log.info(msg)
            network_scratch_visible = True
            log.empty_line()
        else:
            log.warning(msg)
        break
    network_scratch = on_remote("echo %s" % (network_scratch, ))

    # appkernel_dir
    appkernel_dir = ask_for_creatable_dir(
        "Enter future location of app kernels input and executable files:",
        os.path.join(home_dir, "appker", resource_name))
    appkernel_dir = on_remote("echo %s" % (appkernel_dir, ))

    # akrr_data, goes to network scratch if that was found
    if network_scratch_visible:
        akrr_data_default = os.path.join(network_scratch, "akrr_data", resource_name)
    else:
        akrr_data_default = os.path.join(home_dir, "akrr_data", resource_name)
    akrr_data = ask_for_creatable_dir(
        "Enter future locations for app kernels working directories (can or even should be on scratch space):",
        akrr_data_default)
    akrr_data = on_remote("echo %s" % (akrr_data, ))
def generate_batch_job_script(self):
    """
    Generate batch job script and write it to <taskDir>/jobfiles/<JobScriptName>.

    Template variables are merged, later sources overwrite earlier ones:
    resource, app, appkernel_on_resource (resource specific or 'default'),
    default walltime limit from akrr_default_walllimit table, resourceParam
    and appParam.  If ``auto_walltime_limit`` is True the walltime limit is
    re-estimated from previous runs; failure of that step is only logged.

    :raises Exception: database errors while reading walltime defaults are
        propagated as is; any error during script creation is propagated
        after self.status and self.status_info are set
    """
    if self.JobScriptName is None:
        self.JobScriptName = self.get_job_script_name()

    # get walltime from DB
    db_defaults = {}
    db, cur = akrr.db.get_akrr_db()
    try:
        cur.execute(
            '''SELECT resource,app,resource_param,app_param FROM active_tasks WHERE task_id=%s ;''',
            (self.task_id,))
        raw = cur.fetchall()
        if len(raw) > 0:
            (resource, app, resource_param, app_param) = raw[0]

            cur.execute(
                "SELECT walltime_limit FROM akrr_default_walllimit "
                "WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s ",
                (resource, app, resource_param, app_param))
            raw = cur.fetchall()
            if len(raw) > 0:
                db_defaults['walltime_limit'] = raw[0][0]
    finally:
        # close cursor on errors too
        cur.close()
        del db

    # create job-script
    try:
        batch_vars = {}
        appkernel_on_resource = {}
        if 'appkernel_on_resource' in self.app:
            if self.resourceName in self.app['appkernel_on_resource']:
                appkernel_on_resource = self.app['appkernel_on_resource'][self.resourceName]
            elif 'default' in self.app['appkernel_on_resource']:
                appkernel_on_resource = self.app['appkernel_on_resource']['default']
        for di in [self.resource, self.app, appkernel_on_resource, db_defaults, self.resourceParam, self.appParam]:
            batch_vars.update(di)

        # get auto-walltime limit, it is optional so errors are only logged
        try:
            if 'auto_walltime_limit' in batch_vars and batch_vars['auto_walltime_limit'] is True:
                log.info("auto_walltime_limit is on, trying to estimate walltime limit...")
                auto_walltime_limit_overhead = 1.2
                if 'auto_walltime_limit_overhead' in batch_vars:
                    auto_walltime_limit_overhead = batch_vars['auto_walltime_limit_overhead'] + 1.0

                # query last 20 executions of this appkernel on that resource with that node count
                db, cur = akrr.db.get_akrr_db(True)
                cur.execute(
                    "SELECT resource,reporter,reporternickname,collected,status,walltime "
                    "FROM akrr_xdmod_instanceinfo "
                    "WHERE `resource`=%s AND `reporternickname` = %s "
                    "ORDER BY `akrr_xdmod_instanceinfo`.`collected` DESC LIMIT 0 , 20",
                    (self.resource['name'], "%s.%d" % (self.app['name'], batch_vars['nnodes'])))
                raw = cur.fetchall()

                last_five_runs_successfull = True
                max_walltime = 0.0
                for i, r in enumerate(raw):
                    if i < 5 and r['status'] == 0:
                        last_five_runs_successfull = False
                    if r['status'] == 1 and r['walltime'] > max_walltime:
                        max_walltime = r['walltime']
                num_runs = len(raw)

                if num_runs < 5:
                    log.info(
                        "There are only %d previous run, need at least 5 for walltime limit autoset" % num_runs)
                elif not last_five_runs_successfull:
                    log.warning("One of last 5 runs have failed. Would not use autoset.")
                elif max_walltime < 120:
                    log.info("Previous walltime was less than 2 minutes, will set walltime limit to 2 minutes")
                    batch_vars['walltime_limit'] = 2
                else:
                    # longest successful run plus overhead, rounded up to minutes
                    new_limit = int(auto_walltime_limit_overhead * max_walltime / 60.0 + 0.99)
                    log.info(
                        "Max walltime was %.1f s, will change walltime limit from %.1f minutes to %d minutes" %
                        (max_walltime, batch_vars['walltime_limit'], new_limit))
                    batch_vars['walltime_limit'] = new_limit
                cur.close()
                del db
        except Exception as e:
            log.error("Exception happened in AkrrTaskHandlerAppKer.generate_batch_job_script: %s" % str(e))

        # calculate NNodes and NCores
        if 'nnodes' in batch_vars:
            tmp_num_nodes = batch_vars['nnodes']
            tmp_num_cores = tmp_num_nodes * batch_vars['ppn']
        else:
            tmp_num_cores = batch_vars['ncores']
            # integer ceiling division: true division gives float in python 3
            # and node count would never pass the check below
            tmp_num_nodes = -(-tmp_num_cores // batch_vars['ppn'])

        assert isinstance(tmp_num_nodes, int)
        assert isinstance(tmp_num_cores, int)

        batch_vars['akrr_num_of_cores'] = tmp_num_cores
        batch_vars['akrr_num_of_nodes'] = tmp_num_nodes

        # Set batch_vars remaps
        batch_vars['akrr_ppn'] = batch_vars['ppn']
        batch_vars['akrrNCoresToBorder'] = batch_vars['akrr_ppn'] * batch_vars['akrr_num_of_nodes']
        batch_vars['akrr_task_work_dir'] = self.remoteTaskDir
        # walltime_limit is in minutes, format it as HH:MM:00
        batch_vars['akrr_walltime_limit'] = "%02d:%02d:00" % divmod(int(batch_vars['walltime_limit']), 60)
        batch_vars['akrr_appkernel_name'] = self.app['name']
        batch_vars['akrr_resource_name'] = self.resource['name']
        batch_vars['akrr_time_stamp'] = self.timeStamp
        if batch_vars['akrr_num_of_nodes'] == 1:
            batch_vars['akrrPPN4NodesOrCores4OneNode'] = batch_vars['akrr_num_of_cores']
        else:
            batch_vars['akrrPPN4NodesOrCores4OneNode'] = batch_vars['akrr_ppn']

        if 'node_list_setter_template' not in batch_vars:
            batch_vars['node_list_setter_template'] = batch_vars['node_list_setter'][batch_vars['batch_scheduler']]

        # process templates
        batch_vars['akrrCommonCommands'] = akrr.util.format_recursively(
            batch_vars['akrr_common_commands_template'], batch_vars, keep_double_brackets=True)
        batch_vars['akrrCommonCleanup'] = akrr.util.format_recursively(
            batch_vars['akrr_common_cleanup_template'], batch_vars, keep_double_brackets=True)

        # specially for IOR request two nodes for single node benchmark, one for read and one for write
        if batch_vars['appkernel_requests_two_nodes_for_one'] is True and batch_vars['akrr_num_of_nodes'] == 1 and \
                'batch_job_header_template' in batch_vars:
            batch_vars2 = copy.deepcopy(batch_vars)
            batch_vars2['akrr_num_of_cores'] = 2 * batch_vars['akrr_num_of_cores']
            batch_vars2['akrr_num_of_nodes'] = 2 * batch_vars['akrr_num_of_nodes']
            batch_vars2['akrrNCoresToBorder'] = 2 * batch_vars['akrrNCoresToBorder']
            batch_vars2['akrrPPN4NodesOrCores4OneNode'] = batch_vars['akrr_ppn']
            batch_vars['batch_job_header_template'] = akrr.util.format_recursively(
                batch_vars2['batch_job_header_template'], batch_vars2)

        # do parameters adjustment
        if 'process_params' in batch_vars:
            batch_vars['process_params'](batch_vars)

        # generate job script
        job_script = akrr.util.format_recursively(self.resource["batch_job_template"], batch_vars)
        job_script_full_path = os.path.join(self.taskDir, "jobfiles", self.JobScriptName)
        with open(job_script_full_path, "w") as fout:
            fout.write(job_script)
    except Exception:
        self.status = "ERROR: Can not created batch job script"
        self.status_info = traceback.format_exc()
        akrr.util.log.log_traceback(self.status)
        raise
def install_cron_scripts(self):
    """
    Install cron scripts.

    Makes sure that user's crontab has AKRR entries for daemon restart,
    daemon checknrestart and archive, and that MAILTO matches
    ``self.cron_email`` (MAILTO line is commented out if no email is set).
    Existing crontab lines are kept.  Does nothing if ``akrr.dry_run`` is set.
    """
    log.info("Installing cron entries")
    if akrr.dry_run:
        return

    mail = ("MAILTO = " + self.cron_email) if self.cron_email else None

    akrr_exe = _akrr_bin_dir + '/akrr'
    restart = '50 23 * * * bash -l -c "' + akrr_exe + ' daemon restart -cron"'
    # -c is needed here as well, without it bash takes the string as a script file name
    check_and_restart = '33 * * * * bash -l -c "' + akrr_exe + ' daemon checknrestart -cron"'
    archive = '43 1 * * * bash -l -c "' + akrr_exe + ' archive -cron"'

    try:
        crontab_content = subprocess.check_output(["crontab", "-l"])
        crontab_content = crontab_content.decode("utf-8").splitlines(True)
    except Exception:
        # crontab -l exits with error if user has no crontab
        log.info("Crontab does not have user's crontab yet")
        crontab_content = []

    mail_updated = False
    mail_there = False
    restart_there = False
    check_and_restart_there = False
    archive_there = False

    for i, line in enumerate(crontab_content):
        stripped = line.strip()
        if len(stripped) <= 1 or stripped[0] == "#":
            continue

        m = re.match(r'^MAILTO\s*=\s*(.*)', stripped)
        if m:
            cron_email = m.group(1).replace('"', '')
            mail_there = True
            if self.cron_email != cron_email:
                if mail:
                    # lines keep their line ends, so replacement needs one too
                    crontab_content[i] = mail + "\n"
                else:
                    crontab_content[i] = "#" + crontab_content[i]
                mail_updated = True

        if "akrr" not in line:
            continue
        if "daemon" in line:
            # whole word match, otherwise checknrestart counts as restart
            if re.search(r'\brestart\b', line):
                restart_there = True
            if "checknrestart" in line:
                check_and_restart_there = True
        # archive entry has no "daemon" in it
        if "archive" in line:
            archive_there = True

    if mail_updated:
        log.info("Cron's MAILTO was updated")

    mail_is_fine = (not mail_updated) and (mail is None or mail_there)
    if mail_is_fine and restart_there and check_and_restart_there and archive_there:
        log.warning("All AKRR crond entries found. No modifications necessary.")
        return

    # new entries should not get glued to the unterminated last line
    if crontab_content and not crontab_content[-1].endswith("\n"):
        crontab_content[-1] += "\n"

    if mail is not None and not mail_there:
        crontab_content.insert(0, mail + "\n")
    if not restart_there:
        crontab_content.append(restart + "\n")
    if not check_and_restart_there:
        crontab_content.append(check_and_restart + "\n")
    if not archive_there:
        crontab_content.append(archive + "\n")

    if akrr.dry_run:
        # not reachable at the moment because of the return at the top
        log.dry_run("For removing old AKRR should update crontab to:\n" + "".join(crontab_content))
        return

    tmp_cronfile_fd, tmp_cronfile = mkstemp(prefix="crontmp", dir=os.path.expanduser('~'), text=True)
    try:
        with open(tmp_cronfile_fd, 'wt') as f:
            f.writelines(crontab_content)
        return_code = subprocess.call(["crontab", tmp_cronfile])
    finally:
        # temporary file is removed on failures too
        os.remove(tmp_cronfile)

    if return_code != 0:
        log.error("Can not update crontab, crontab exited with code %s", return_code)
        return
    log.info("Crontab updated.")
def app_validate(resource, appkernel, nnodes):
    """
    Validate an application kernel installation on a resource.

    The validation goes through the following stages:

    1. syntax check of the resource and application kernel configuration files;
    2. registration of the application kernel in the databases;
    3. connection to the resource and check of the configured directories;
    4. submission of a test job through the AKRR REST API and waiting for it;
    5. analysis of the test job results;
    6. on success, enabling the application kernel on the resource.

    Unrecoverable problems are logged and terminate the process with exit(1).

    :param resource: name of the resource
    :param appkernel: name of the application kernel
    :param nnodes: number of nodes to use for the test job
    """
    from akrr.util.log import verbose

    resource_name = resource
    app_name = appkernel

    error_count = 0
    warning_count = 0

    log.info("Validating " + app_name + " application kernel installation on " + resource_name)

    from akrr import get_akrr_dirs

    akrr_dirs = get_akrr_dirs()

    default_resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.resource.conf"))
    resource_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['cfg_dir'], "resources", resource_name, "resource.conf"))

    default_app_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], "default.app.conf"))
    app_ker_param_filename = os.path.abspath(
        os.path.join(akrr_dirs['default_dir'], app_name + ".app.conf"))

    def _exec_config_file(filename, namespace):
        """Execute configuration file in namespace, the file is closed afterwards."""
        with open(filename) as fin:
            exec(compile(fin.read(), filename, 'exec'), namespace)

    ###############################################################################################
    # validating resource parameter file

    log.info("#" * 80)
    log.info("Validating %s parameters from %s" % (resource_name, resource_param_filename))

    if not os.path.isfile(resource_param_filename):
        log.error("resource parameters file (%s) do not exists!" % (resource_param_filename, ))
        exit(1)

    # check syntax
    try:
        tmp = {}
        _exec_config_file(default_resource_param_filename, tmp)
        _exec_config_file(resource_param_filename, tmp)
    except Exception:
        log.exception("Can not load resource from " + resource_param_filename + "\n" +
                      "Probably invalid syntax.")
        exit(1)

    # check syntax
    try:
        tmp = {}
        _exec_config_file(default_app_param_filename, tmp)
        _exec_config_file(app_ker_param_filename, tmp)
    except Exception:
        log.exception("Can not load application kernel from " + app_ker_param_filename + "\n" +
                      "Probably invalid syntax")
        exit(1)

    # now we can load akrr
    from akrr import cfg
    from akrr import akrrrestclient
    from akrr.cli.resource_deploy import make_results_summary
    from akrr.cfg_util import load_app_default, load_app_on_resource

    resource = cfg.find_resource_by_name(resource_name)
    log.info("Syntax of %s is correct and all necessary parameters are present." % resource_param_filename)

    cfg.find_app_by_name(app_name)
    try:
        app_default = load_app_default(app_name)
        app = load_app_on_resource(app_name, resource_name, resource, app_default)
        pprint.pprint(app)
    except Exception as e:  # pylint: disable=broad-except
        log.exception("Exception occurred during updated app loading:" + str(e))
        exit(1)
    log.info("Syntax of %s is correct and all necessary parameters are present." % app_ker_param_filename)

    # check if AK is in DB
    # add entry to mod_appkernel.resource
    db_ak, cur_ak = akrr.db.get_ak_db(True)

    cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, ))
    ak_in_akdb = cur_ak.fetchall()
    if len(ak_in_akdb) == 0:
        cur_ak.execute(
            '''INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible) '''
            '''VALUES(%s,%s,'node',0,%s,0);''',
            (app_name, app_name, app_name))
        db_ak.commit()
    # re-read the row, it is the source of the id used in mod_akrr below
    cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, ))
    ak_in_akdb = cur_ak.fetchall()[0]

    # add entry to mod_akrr.resource
    db, cur = akrr.db.get_akrr_db(True)

    cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name, ))
    ak_in_db = cur.fetchall()
    if len(ak_in_db) == 0:
        cur.execute(
            '''INSERT INTO app_kernels (id,name,enabled,nodes_list) VALUES(%s,%s,0,'1,2,4,8');''',
            (ak_in_akdb['ak_def_id'], app_name))
        db.commit()

    ###############################################################################################
    # connect to resource
    log.info("#" * 80)
    log.info("Validating resource accessibility. Connecting to %s." % (resource['name']))
    if resource['ssh_private_key_file'] is not None and os.path.isfile(resource['ssh_private_key_file']) is False:
        log.error("Can not access ssh private key (%s)" % (resource['ssh_private_key_file'], ))
        exit(1)

    str_io = io.StringIO()
    try:
        # output of the connection attempt is captured and shown only on failure
        sys.stdout = sys.stderr = str_io
        try:
            # Connect to resource
            # Spin-up instance before ssh it
            if resource['batch_scheduler'].lower() == "openstack":
                # Start instance if it is cloud
                openstack_server = akrr.util.openstack.OpenStackServer(resource=resource)
                resource['openstack_server'] = openstack_server
                openstack_server.create()
                resource['remote_access_node'] = openstack_server.ip
            if resource['batch_scheduler'].lower() == "googlecloud":
                # Start instance if it is cloud
                googlecloud_server = akrr.util.googlecloud.GoogleCloudServer(resource=resource)
                resource['googlecloud_server'] = googlecloud_server
                googlecloud_server.create()
                resource['remote_access_node'] = googlecloud_server.ip

            rsh = akrr.util.ssh.ssh_resource(resource)
        finally:
            # restore the streams before anything is logged
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
    except Exception:
        msg2 = str_io.getvalue()
        msg2 += "\n" + traceback.format_exc()
        msg = "Can not connect to " + resource['name'] + "\n" + \
              "Probably invalid credential, see full error report below\n" + msg2
        log.error(msg)
        raise
    print("=" * 80)
    log.info("Successfully connected to %s\n\n" % (resource['name']))

    ###############################################################################################
    log.info("Checking directory locations\n")

    try:
        # these directories must exist (they are created if necessary)
        for dir_key in ('akrr_data', 'appkernel_dir'):
            d = resource[dir_key]
            log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
            status, msg = check_dir(rsh, d, exit_on_fail=True, try_to_create=True)
            log.info(msg + "\n")

        # scratch directories can legitimately be absent on head node, so only warn
        for dir_key, dir_label in (('network_scratch', 'network scratch'),
                                   ('local_scratch', 'local scratch')):
            d = resource[dir_key]
            log.info("Checking: %s:%s" % (resource['remote_access_node'], d))
            status, msg = check_dir(rsh, d, exit_on_fail=False, try_to_create=False)
            if status is True:
                log.info(msg)
            else:
                log.warning(msg)
                log.warning(
                    ("WARNING %d: " + dir_label + " might be have a different location " +
                     "on head node, so if it is by design it is ok") % (warning_count + 1))
                warning_count += 1
            log.info("")
    finally:
        # close connection we don't need it any more;
        # done in finally so that a failed check does not leave a cloud instance running
        rsh.close(force=True)
        del rsh

        # Delete openstack instance after tests
        if resource['batch_scheduler'].lower() == "openstack":
            # delete instance if it is cloud
            resource['openstack_server'].delete()
            resource['remote_access_node'] = None
        if resource['batch_scheduler'].lower() == "googlecloud":
            # delete instance if it is cloud
            resource['googlecloud_server'].delete()
            resource['remote_access_node'] = None

    ###############################################################################################
    # send test job to queue

    log.info("#" * 80)
    log.info("Will send test job to queue, wait till it executed and will analyze the output")

    print("Will use AKRR REST API at", akrrrestclient.restapi_host)
    # get check connection
    try:
        r = akrrrestclient.get('/scheduled_tasks')
        if r.status_code != 200:
            log.error(
                "Can not get token for AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                "See server response below:\n %s", json.dumps(r.json(), indent=4))
            exit(1)
    except Exception:
        log.error("Can not connect to AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                  "Is it running?\n" +
                  "See full error report below:\n" + traceback.format_exc())
        exit(1)

    # check if the test job is already submitted
    task_id = None
    test_job_lock_filename = os.path.join(cfg.data_dir, resource_name + "_" + app_name + "_test_task.dat")
    if os.path.isfile(test_job_lock_filename):
        try:
            with open(test_job_lock_filename, "r") as fin:
                task_id = int(fin.readline())
        except ValueError:
            # unreadable lock file, treat it as if there were no previous task
            task_id = None

        if task_id is not None:
            r = akrrrestclient.get('/tasks/' + str(task_id))
            if r.status_code != 200:
                task_id = None
            else:
                log.warning(
                    "\nWARNING %d: Seems this is rerun of this script, will monitor task with task_id = " % (
                        warning_count + 1) + str(task_id))
                log.warning("To submit new task delete " + test_job_lock_filename + "\n")
                warning_count += 1
            # check how old is it

    # submit test job
    if task_id is None:
        try:
            payload = {
                'resource': resource_name,
                'app': app_name,
                'resource_param': "{'nnodes':%d}" % nnodes,
                'task_param': "{'test_run':True}"
            }
            r = akrrrestclient.post('/scheduled_tasks', data=payload)
            if r.status_code != 200:
                log.error(
                    "Can not submit task through AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                    "See server response below:\n%s", json.dumps(r.json(), indent=4))
                exit(1)
            task_id = r.json()['data']['data']['task_id']
        except Exception:
            log.error("Can not submit task through AKRR REST API ( " + akrrrestclient.restapi_host + " )\n" +
                      "Is it still running?\n" +
                      "See full error report below:\n" + traceback.format_exc())
            exit(1)
        # write file with task_id
        with open(test_job_lock_filename, "w") as fout:
            print(task_id, file=fout)
        log.info("\nSubmitted test job to AKRR, task_id is " + str(task_id) + "\n")

    # now wait till job is done
    msg_body0 = ""
    while True:
        t = datetime.datetime.now()
        r = akrrrestclient.get('/tasks/' + str(task_id))

        # the message and the queue are built from scratch on every poll, so a failed
        # request does not depend on values from a previous (possibly absent) iteration
        queue = None
        msg_body = "=" * 80
        msg_body += "\nTast status:\n"

        if r.status_code == 200:
            response_json = r.json()
            queue = response_json["data"]["queue"]
            task_data = response_json["data"]["data"]

            if queue == "scheduled_tasks":
                msg_body += "Task is in scheduled_tasks queue.\n"
                msg_body += "It schedule to be started on " + task_data['time_to_start'] + "\n"
            elif queue == "active_tasks":
                msg_body += "Task is in active_tasks queue.\n"
                msg_body += "Status: " + str(task_data['status']) + "\n"
                msg_body += "Status info:\n" + str(task_data['status_info']) + "\n"
            elif queue == "completed_tasks":
                msg_body += "Task is completed!\n"
                completed_tasks = task_data['completed_tasks']
                akrr_xdmod_instanceinfo = task_data['akrr_xdmod_instanceinfo']
                if verbose:
                    msg_body += "completed_tasks table entry:\n" + pp.pformat(completed_tasks) + "\n"
                    msg_body += "akrr_xdmod_instanceinfo table entry:\n" + pp.pformat(
                        akrr_xdmod_instanceinfo) + "\n"
                    msg_body += 'output parsing results:\n' + akrr_xdmod_instanceinfo['body'] + "\n"
                else:
                    msg_body += "\tstatus: " + str(akrr_xdmod_instanceinfo['status']) + "\n"
                    if akrr_xdmod_instanceinfo['status'] == 0:
                        msg_body += "\tstatus2: " + completed_tasks['status'] + "\n"
                    msg_body += "\tstatus_info: " + completed_tasks['status_info'] + "\n"
        else:
            msg_body += r.text + "\n"

        tail_msg = "time: " + t.strftime("%Y-%m-%d %H:%M:%S")

        # print the whole message only when it has changed, otherwise just refresh time stamp
        if msg_body != msg_body0:
            print("\n\n" + msg_body)
            print(tail_msg, end=' ')
        else:
            print("\r" + tail_msg, end=' ')
        sys.stdout.flush()

        msg_body0 = msg_body

        if queue == "completed_tasks":
            break

        # try to update:
        try:
            payload = {'next_check_time': ''}
            akrrrestclient.put('/active_tasks/' + str(task_id), data=payload)
        except Exception:
            # best effort only: the task might have left the active queue already
            pass
        time.sleep(5)

    ###############################################################################################
    # analysing the output
    log.info("Test job is completed analyzing output\n")
    r = akrrrestclient.get('/tasks/' + str(task_id))
    if r.status_code != 200:
        log.error(
            "Can not get information about task\n" +
            "See full error report below\n" +
            "AKRR server response:\n%s", r.text)
        exit(1)

    task_data = r.json()['data']['data']
    completed_tasks = task_data['completed_tasks']
    akrr_xdmod_instanceinfo = task_data['akrr_xdmod_instanceinfo']
    akrr_errmsg = task_data['akrr_errmsg']

    results_summary = make_results_summary(
        resource_name, app_name, completed_tasks, akrr_xdmod_instanceinfo, akrr_errmsg)

    # execution was not successful
    if completed_tasks['status'].count("ERROR") > 0:
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue") > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\n" +
                "See full error report below:\n" +
                results_summary)
        else:
            log.error(completed_tasks['status'] + "\n" +
                      "See full error report below:\n" +
                      results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # execution was not successful
    if akrr_xdmod_instanceinfo['status'] == 0:
        log.error("Task execution was not successful\n" +
                  "See full error report below:\n" +
                  results_summary)
        os.remove(test_job_lock_filename)
        exit(1)

    # see what is in report
    # the lookups below serve as a sanity check: they fail on a malformed report
    elm_perf = XMLElementTree.fromstring(akrr_xdmod_instanceinfo['body'])
    elm_perf.find('benchmark').find('parameters')
    elm_perf.find('benchmark').find('statistics')

    log.info("\nTest kernel execution summary:")
    print(results_summary)
    print()
    # log.info("\nThe output looks good.\n")

    if error_count == 0:
        # enabling resource for execution
        log.info("\nEnabling %s on %s for execution\n" % (app_name, resource_name))
        try:
            result = akrrrestclient.put('/resources/%s/on' % (resource_name, ),
                                        data={'application': app_name})
            if result.status_code == 200:
                log.info("Successfully enabled %s on %s" % (app_name, resource_name))
            else:
                log.error("Can not turn-on %s on %s\n%s", app_name, resource_name, result.text)
                exit(1)

            # add entry to mod_appkernel.resource
            db_ak, cur_ak = akrr.db.get_ak_db(True)

            cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, ))
            ak_in_akdb = cur_ak.fetchall()
            if len(ak_in_akdb) == 0:
                cur_ak.execute(
                    "INSERT INTO app_kernel_def (name,ak_base_name,processor_unit,enabled, description, visible)"
                    "VALUES(%s,%s,'node',0,%s,0);",
                    (app_name, app_name, app_name))
                db_ak.commit()
            cur_ak.execute('''UPDATE app_kernel_def SET enabled=1,visible=1 WHERE ak_base_name=%s''',
                           (app_name, ))
            db_ak.commit()

            # add entry to mod_akrr.resource
            db, cur = akrr.db.get_akrr_db(True)

            cur.execute('''SELECT * FROM app_kernels WHERE name=%s''', (app_name, ))
            ak_in_db = cur.fetchall()
            if len(ak_in_db) == 0:
                # fetch the single definition row, the id can not be taken from a list of rows
                cur_ak.execute('''SELECT * FROM app_kernel_def WHERE ak_base_name=%s''', (app_name, ))
                ak_def = cur_ak.fetchall()[0]
                cur.execute(
                    '''INSERT INTO app_kernels (id,name,enabled,nodes_list) VALUES(%s,%s,0,'1,2,4,8');''',
                    (ak_def['ak_def_id'], app_name))
                db.commit()
            cur.execute('''UPDATE app_kernels SET enabled=1 WHERE name=%s''', (app_name, ))
            db.commit()
        except Exception:
            log.exception("Can not turn-on %s on %s", app_name, resource_name)
            exit(1)

    if error_count > 0:
        log.error("There are %d errors, fix them.", error_count)
    if warning_count > 0:
        log.warning(
            "\nThere are %d warnings.\nif warnings have sense (highlighted in yellow), you can move to next step!\n" %
            warning_count)
    if error_count == 0 and warning_count == 0:
        log.info("\nDONE, you can move to next step!\n")
    # NOTE(review): the lock file is removed regardless of warnings so that the next
    # run submits a fresh test task - confirm this matches the intended behaviour
    os.remove(test_job_lock_filename)
def _remove_log_dir():
    """
    Remove the AKRR log directory (``cfg.akrr_log_dir``).

    Only a warning is issued when the log directory is not configured.
    """
    if cfg.akrr_log_dir is not None:
        _remove_dir(cfg.akrr_log_dir)
    else:
        log.warning("akrr_log_dir is None")
def run_it(_):
    """Placeholder handler: only warns that the command is not implemented, the argument is ignored."""
    not_implemented_msg = "add_command_install is not implemented"
    log.warning(not_implemented_msg)
def run_it(_):
    """Placeholder handler: only warns that the command is not implemented, the argument is ignored."""
    not_implemented_msg = "add_command_build is not implemented"
    log.warning(not_implemented_msg)
def copy_exec_sources_and_inputs(rsh, resource):
    """
    Copy exec sources and inputs to remote resource.

    The ``inputs`` and ``execs`` tarballs from the app. kernel repository are
    copied to ``resource['appkernel_dir']`` and unpacked there, each only if
    the respective directory is not present yet.  An already present
    directory is assumed to be correct and only counted as a warning.
    In dry-run mode nothing is copied, unpacked or removed.

    :param rsh: open shell connection to the resource
    :param resource: resource configuration
    :raises Exception: if files could not be copied, after logging a critical message
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        " app. kernel input files and \n"
        " HPCC, IMB, IOR and Graph500 source code to remote resource\n")

    appkernel_dir = resource['appkernel_dir']
    try:
        # subsequent relative paths (tar, rm) are relative to appkernel_dir
        akrr.util.ssh.ssh_command(rsh, "cd %s" % appkernel_dir)
        out = akrr.util.ssh.ssh_command(rsh, "ls " + appkernel_dir)
        files_in_appker_dir = out.strip().split()

        if "inputs" in files_in_appker_dir or "inputs/" in files_in_appker_dir:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, appkernel_dir)
        else:
            log.info("Copying app. kernel input tarball to %s", appkernel_dir)
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/inputs.tar.gz", appkernel_dir)

            log.info("Unpacking app. kernel input files to %s/inputs", appkernel_dir)
            if not akrr.dry_run:
                out = akrr.util.ssh.ssh_command(rsh, "tar xvfz %s/inputs.tar.gz" % appkernel_dir)
                log.debug(out)

                # verify that the directory appeared
                out = akrr.util.ssh.ssh_command(rsh, "du -h %s/inputs" % appkernel_dir)
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info("App. kernel input files are in %s/inputs\n", appkernel_dir)
                else:
                    raise Exception("files are not copied!")

        if "execs" in files_in_appker_dir or "execs/" in files_in_appker_dir:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, appkernel_dir)
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n")
        else:
            log.info(
                "Copying app. kernel execs tarball to %s\n" % (appkernel_dir) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator")
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/execs.tar.gz", appkernel_dir)

            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                appkernel_dir)
            if not akrr.dry_run:
                out = akrr.util.ssh.ssh_command(rsh, "tar xvfz %s/execs.tar.gz" % appkernel_dir)
                log.debug(out)

                # verify that the directory appeared
                out = akrr.util.ssh.ssh_command(rsh, "df -h %s/execs" % appkernel_dir)
                log.debug(out)

                if out.count("No such file or directory") == 0:
                    log.info(
                        "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                        appkernel_dir)
                else:
                    raise Exception("files are not copied!")

        # clean up the tarballs; -f because either of them may be absent,
        # and a dry run must not remove anything on the resource
        if not akrr.dry_run:
            akrr.util.ssh.ssh_command(rsh, "rm -f execs.tar.gz inputs.tar.gz")
    except Exception:
        log.critical("Can not copy files to %s", resource['name'])
        raise
def resource_add(config):
    """
    Add resource, config should have following members:
        dry_run - Dry Run No files will actually be created
        minimalistic - Minimize questions number, configuration files will be edited manually
        no_ping - do not run ping to test headnode name
        verbose

    The resource is selected interactively (optionally matched with a
    resource from XDMoD), access details are collected unless
    ``minimalistic`` is set, and the resource configuration is generated.
    """
    global verbose
    global no_ping
    global minimalistic
    global resource_name
    global remote_access_node
    global remote_access_method
    global remote_copy_method
    global ssh_username
    global ssh_password
    global ssh_private_key_file
    global ssh_private_key_password
    global network_scratch
    global local_scratch
    global akrr_data
    global appkernel_dir
    global batch_scheduler
    global batch_job_header_template

    log.info("Beginning Initiation of New Resource...")
    verbose = config.verbose
    akrr.dry_run = config.dry_run
    no_ping = config.no_ping
    minimalistic = config.minimalistic

    log.info("Retrieving Resources from XDMoD Database...")
    # RETRIEVE: the resources from XDMoD
    resources = retrieve_resources_from_xdmod()
    log.info("Found following resources from XDMoD Database:\n" +
             " resource_id name\n" +
             "\n".join([
                 " %11d %-40s" % (xdmod_id, xdmod_name) for xdmod_name, xdmod_id in resources
             ]) + "\n")

    resource_id = None
    if len(resources) > 0:
        while True:
            log.log_input('Enter resource_id for import (enter None for no match):')
            resource_id = input()
            if validate_resource_id(resource_id, resources):
                break
            log.warning("Incorrect resource_id try again")
        log.empty_line()
        # validated answer is either the literal "None" or an id
        resource_id = None if resource_id == "None" else int(resource_id)

    resource_name = ask.ask(
        'Enter AKRR resource name',
        validate=validate_resource_name,
        default=None if resource_id is None else get_resource_name_by_id(resource_id, resources))
    batch_scheduler = ask.multiple_choice_enum(
        'Enter queuing system on resource', QueuingSystemType).value

    if minimalistic is False:
        # values are compared by equality, identity of strings is not guaranteed
        if batch_scheduler == QueuingSystemType.openstack.value:
            _get_openstack_details()
            get_system_characteristics()
        elif batch_scheduler == QueuingSystemType.googlecloud.value:
            _get_googlecloud_details()
            get_system_characteristics()
        else:
            get_remote_access_method()
            get_system_characteristics()
            get_file_system_access_points()

    def _masked(secret):
        """Hide a secret value, showing only whether it is set."""
        return None if secret is None else "********"

    # secrets are masked: passwords must not end up in log files
    log.debug(
        "Summary of parameters\n" +
        "resource_name: {}\n".format(resource_name) +
        "remote_access_node: {}\n".format(remote_access_node) +
        "remote_access_method: {}\n".format(remote_access_method) +
        "remote_copy_method: {}\n".format(remote_copy_method) +
        "ssh_username: {}\n".format(ssh_username) +
        "ssh_password: {}\n".format(_masked(ssh_password)) +
        "ssh_private_key_file: {}\n".format(ssh_private_key_file) +
        "ssh_private_key_password: {}\n".format(_masked(ssh_private_key_password)) +
        "network_scratch: {}\n".format(network_scratch) +
        "local_scratch: {}\n".format(local_scratch) +
        "akrr_data: {}\n".format(akrr_data) +
        "appkernel_dir: {}\n".format(appkernel_dir) +
        "batch_scheduler: {}\n".format(batch_scheduler) +
        "batch_job_header_template: {}".format(batch_job_header_template) + "\n")

    generate_resource_config(resource_id, resource_name, batch_scheduler)
    log.info("Initiation of new resource is completed.\n"
             " Edit batch_job_header_template variable in {}\n"
             " and move to resource validation and deployment step.\n"
             " i.e. execute:\n"
             " akrr resource deploy -r {}".format(resource_cfg_filename, resource_name))
def check_connection_to_resource():
    """
    Check the connection to remote resource.

    Tries to run ``ls`` on the head node with the currently set credentials.
    If the resource asks for the passphrase of a private key, the user is
    asked for it (up to three times).  If a private key is set but the
    resource asks for a password, the public key is presumably not
    authorized yet: the user is asked for the password (up to three times)
    and the key is added with ssh-copy-id, then the connection is tried
    again.

    :return: True if connection was successful, False otherwise
    """
    global remote_access_node
    global remote_access_method
    global remote_copy_method
    global ssh_username
    global ssh_password
    global ssh_password4thisSession
    global ssh_private_key_file
    global ssh_private_key_password

    successfully_connected = False
    passphrase_entrance_count = 0
    authorize_key_count = 0
    while True:
        # Try to connect
        str_io = io.StringIO()
        try:
            sys.stdout = sys.stderr = str_io
            try:
                akrr.util.ssh.ssh_access(
                    remote_access_node, ssh=remote_access_method, username=ssh_username,
                    password=ssh_password,
                    private_key_file=ssh_private_key_file,
                    private_key_password=ssh_private_key_password,
                    logfile=str_io,
                    command='ls')
            finally:
                # always give the real streams back before logging anything
                sys.stdout = sys.__stdout__
                sys.stderr = sys.__stderr__
            successfully_connected = True
            break
        except Exception:
            response = str_io.getvalue()
            log.debug(
                "Had attempted to access resource without password and failed, below is resource response" +
                "=" * 80 +
                response +
                "=" * 80)

        # check if it asking for passphrase
        m = re.search(r"Enter passphrase for key '(.*)':", response)
        if m:
            if passphrase_entrance_count >= 3:
                # give up on this key
                ssh_private_key_password = None
                ssh_private_key_file = None
                break
            if passphrase_entrance_count > 0:
                log.error("Incorrect passphrase try again")
            ssh_private_key_file = m.group(1)
            log.log_input("Enter passphrase for key '%s':" % ssh_private_key_file)
            ssh_private_key_password = getpass.getpass("")
            passphrase_entrance_count += 1
            continue

        m2 = re.search(r"[pP]assword:", response)
        if ssh_private_key_file is not None and m2 and authorize_key_count < 3:
            log.warning(
                "Can not login to head node. "
                "Probably the public key of private key was not authorized on head node")
            log.info("Will try to add public key to list of authorized keys on head node")

            key_authorized = False
            # the attempts counter is shared by all rounds, which bounds the outer loop
            while authorize_key_count < 3:
                authorize_key_count += 1
                log.log_input("Enter password for %s@%s (will be used only during this session):" % (
                    ssh_username, remote_access_node))
                ssh_password4thisSession = getpass.getpass("")
                log.empty_line()
                str_io = io.StringIO()
                try:
                    sys.stdout = sys.stderr = str_io
                    try:
                        akrr.util.ssh.ssh_access(
                            remote_access_node, ssh='ssh-copy-id', username=ssh_username,
                            password=ssh_password4thisSession,
                            private_key_file=ssh_private_key_file, private_key_password=None,
                            logfile=str_io,
                            command='')
                    finally:
                        sys.stdout = sys.__stdout__
                        sys.stderr = sys.__stderr__
                except Exception:
                    if verbose:
                        log.debug(
                            "Had attempted to add public key to list of authorized keys on head node and failed, " +
                            "below is resource response" +
                            "=" * 80 +
                            str_io.getvalue() +
                            "=" * 80)
                    log.error("Incorrect password try again.")
                else:
                    # show the output of ssh-copy-id itself
                    log.info(str_io.getvalue())
                    log.info(
                        "Have added public key to list of authorized keys on head node, "
                        "will attempt to connect again.")
                    log.empty_line()
                    key_authorized = True
                    break

            if key_authorized:
                # the key was added (possibly on the last allowed attempt), try to connect again
                continue
        break
    return successfully_connected
def resource_deploy(args):
    """
    Validate resource configuration and deploy AKRR to the resource.

    The resource is checked (shell, directories), inputs and sources are
    copied to it, app. signature calculator is checked and a test job is
    executed.  If there were no errors the resource is enabled for
    execution.  For OpenStack resources the instance is started for the
    time of each stage and deleted afterwards, also if the stage fails.

    :param args: parsed command line arguments, ``resource`` is mandatory;
                 ``dry_run``, ``checking_frequency``, ``appkernel`` and ``nodes`` are optional
    """
    global checking_frequency

    resource_name = args.resource

    akrr.dry_run = args.dry_run if 'dry_run' in args else False

    if "checking_frequency" in args:
        checking_frequency = args.checking_frequency

    app_name = args.appkernel if "appkernel" in args else "test"
    nodes = int(args.nodes) if "nodes" in args else 2

    log.error_count = 0
    log.warning_count = 0

    # validate resource configuration and get config
    resource = validate_resource_parameter_file(resource_name)

    # connect to resource
    _resource_deploy_start_instance(resource)
    try:
        rsh = connect_to_resource(resource)

        # do tests
        check_shell(rsh, resource)
        check_create_dirs(rsh, resource)

        # deploy inputs and sources
        copy_exec_sources_and_inputs(rsh, resource)

        # check that app.signature calculator on headnode
        check_appsig(rsh, resource)

        # close connection we don't need it any more
        rsh.close(force=True)
        del rsh
    finally:
        _resource_deploy_delete_instance(resource)

    # run test job to queue
    run_test_job(resource, app_name, nodes)

    _resource_deploy_start_instance(resource)
    try:
        if log.error_count == 0:
            append_to_bashrc(resource)
            enable_resource_for_execution(resource)
    finally:
        _resource_deploy_delete_instance(resource)

    log.empty_line()

    log.info("Result:")
    if log.error_count > 0:
        log.error("There are %d errors, fix them.", log.error_count)

    if log.warning_count > 0:
        log.warning("There are %d warnings.\nif warnings have sense you can move to next step!\n", log.warning_count)
    if log.error_count == 0 and log.warning_count == 0:
        log.info("\nDONE, you can move to next step!\n")


def _resource_deploy_start_instance(resource):
    """Start instance and set remote_access_node if resource is OpenStack cloud, do nothing otherwise."""
    if resource['batch_scheduler'].lower() != "openstack":
        return
    openstack_server = akrr.util.openstack.OpenStackServer(resource=resource)
    resource['openstack_server'] = openstack_server
    openstack_server.create()
    resource['remote_access_node'] = openstack_server.ip


def _resource_deploy_delete_instance(resource):
    """Delete instance started by _resource_deploy_start_instance, do nothing for non-cloud resources."""
    if resource['batch_scheduler'].lower() != "openstack":
        return
    # instance creation itself might have failed, then there is nothing to delete
    openstack_server = resource.get('openstack_server')
    if openstack_server is not None:
        openstack_server.delete()
    resource['remote_access_node'] = None
def get_remote_access_method():
    """
    Interactively set the way the resource is accessed.

    Asks for head node name (checked with ping unless ``no_ping`` is set)
    and user name, then checks the access.  If the resource can not be
    accessed the user selects an authentication method: try again, use an
    existing private key, generate a new key or use password.  On success
    a connection to the resource is opened and stored in global ``rsh``.

    :return: True if the resource was successfully accessed
    """
    global resource_name
    global remote_access_node
    global remote_access_method
    global remote_copy_method
    global ssh_username
    global ssh_password
    global ssh_password4thisSession
    global ssh_private_key_file
    global ssh_private_key_password
    global rsh
    global no_ping

    # local import: commands are run with argument lists, so that user input
    # (host name, passphrase) is never interpreted by a shell
    import subprocess

    # set remote_access_node
    while True:
        log.log_input("Enter Resource head node (access node) full name (e.g. headnode.somewhere.org):")
        remote_access_node = input("[%s] " % resource_name).strip()
        if remote_access_node == "":
            remote_access_node = resource_name

        try:
            response = subprocess.call(
                ["ping", "-c", "1", "-w2", remote_access_node],
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except OSError:
            # ping is not available, same as unsuccessful ping
            response = 1

        if response == 0:
            break
        if no_ping:
            log.warning("Can not ping %s, but asked to ignore it.", remote_access_node)
            break
        log.error("Incorrect head node name (can not ping %s), try again", remote_access_node)

    # set ssh_username
    current_user = getpass.getuser()
    ask_for_user_name = True

    while True:
        if ask_for_user_name:
            log.log_input("Enter username for resource access:")
            ssh_username = input("[%s] " % current_user)
            if ssh_username.strip() == "":
                ssh_username = current_user
            current_user = ssh_username

        # check password-less access
        if ssh_password is None:
            log.info("Checking for password-less access")
        else:
            log.info("Checking for resource access")
        successfully_connected = check_connection_to_resource()

        if successfully_connected:
            if ssh_password is None:
                log.info("Can access resource without password")
            else:
                log.info("Can access resource")

        if successfully_connected is False:
            log.info("Can not access resource without password")
            action_list = [(
                "TryAgain",
                "The private and public keys was generated manually, right now. Try again.")]

            # check private keys: a key is usable if both <name> and <name>.pub exist
            user_home_dir = os.path.expanduser("~")
            user_ssh_dir = os.path.join(user_home_dir, '.ssh')
            if os.path.isdir(user_ssh_dir):
                private_keys = [
                    os.path.join(user_ssh_dir, f[:-4]) for f in os.listdir(user_ssh_dir)
                    if os.path.isfile(os.path.join(user_ssh_dir, f)) and
                    f.endswith('.pub') and os.path.isfile(os.path.join(user_ssh_dir, f[:-4]))
                ]
            else:
                private_keys = []

            if len(private_keys) > 0:
                action_list.append(("UseExistingPrivateKey", "Use existing private and public key."))

            # generation of new key is the default action
            default_action = len(action_list)
            action_list.append(("GenNewKey", "Generate new private and public key."))
            action_list.append(("UsePassword", "Use password directly."))

            log.empty_line()
            log.info("Select authentication method:\n" + "\n".join([
                "%3d %s" % (i, desc) for i, (_, desc) in enumerate(action_list)
            ]))
            while True:
                log.log_input("Select option from list above:")
                try:
                    action = input("[%s] " % default_action)
                    if action.strip() == "":
                        action = default_action
                    else:
                        action = int(action)

                    if action < 0 or action >= len(action_list):
                        raise ValueError()
                    break
                except (ValueError, TypeError):
                    log.error("Incorrect entry, try again.")

            # do the action
            log.empty_line()
            action_name = action_list[action][0]

            if action_name == "TryAgain":
                continue

            if action_name == "UsePassword":
                log.log_input("Enter password for %s@%s:" % (ssh_username, remote_access_node))
                ssh_password = getpass.getpass("")
                ask_for_user_name = not ask_for_user_name
                continue

            if action_name == "UseExistingPrivateKey":
                log.info("Available private keys:\n" + "\n".join(
                    ["%3d %s" % (i, p) for i, p in enumerate(private_keys)]))
                while True:
                    log.log_input("Select key number from list above:")
                    try:
                        i_key = int(input(""))
                        if i_key < 0 or i_key >= len(private_keys):
                            raise ValueError()
                        break
                    except (ValueError, TypeError):
                        log.error("Incorrect entry, try again.")
                ssh_private_key_file = private_keys[i_key]
                ask_for_user_name = not ask_for_user_name
                continue

            if action_name == "GenNewKey":
                # password is needed to install the new key on the resource, three attempts
                count = 0
                while True:
                    log.log_input("Enter password for %s@%s (will be used only during this session):" % (
                        ssh_username, remote_access_node))
                    ssh_password4thisSession = getpass.getpass("")
                    ssh_password = ssh_password4thisSession

                    if check_connection_to_resource():
                        break
                    count += 1
                    if count >= 3:
                        break

                # generate keys
                log.log_input("Enter private key name:")
                ssh_private_key_file = input("[id_rsa_%s]" % resource_name)
                if ssh_private_key_file.strip() == "":
                    ssh_private_key_file = "id_rsa_%s" % resource_name
                ssh_private_key_file = os.path.join(user_home_dir, '.ssh', ssh_private_key_file)

                log.log_input("Enter passphrase for new key (leave empty for passwordless access):")
                ssh_private_key_password = getpass.getpass("")

                if akrr.dry_run:
                    successfully_connected = True
                else:
                    ssh_password = None

                    # argument list instead of a shell string: the passphrase may
                    # contain quotes, spaces or characters special to the shell
                    subprocess.call([
                        "ssh-keygen", "-t", "rsa",
                        "-N", ssh_private_key_password,
                        "-f", ssh_private_key_file])
                    if ssh_private_key_password.strip() == "":
                        ssh_private_key_password = None
                    # copy keys
                    akrr.util.ssh.ssh_access(
                        remote_access_node, ssh='ssh-copy-id', username=ssh_username,
                        password=ssh_password4thisSession,
                        private_key_file=ssh_private_key_file, private_key_password=None,
                        logfile=sys.stdout,
                        command='')
                    ask_for_user_name = not ask_for_user_name
                    continue

        if successfully_connected:
            break
        else:
            log.error("Incorrect resource access credential")

    if successfully_connected:
        log.empty_line()
        log.info("Connecting to " + resource_name)

        str_io = io.StringIO()
        sys.stdout = sys.stderr = str_io
        try:
            # sys.stdout is redirected at this point, so the session is logged to str_io
            rsh = akrr.util.ssh.ssh_access(
                remote_access_node, ssh=remote_access_method, username=ssh_username,
                password=ssh_password,
                private_key_file=ssh_private_key_file,
                private_key_password=ssh_private_key_password,
                logfile=sys.stdout,
                command=None)
        except Exception:
            # restore the streams first, so that the report is not written to the buffer
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            log.debug(str_io.getvalue())
            raise
        finally:
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__

        log.info(" Done")
    log.empty_line()
    return successfully_connected
def run(self,
        akrr_db: str = None,
        ak_db: str = None,
        xd_db: str = None,
        install_cron_scripts: bool = True,
        stand_alone: bool = False,
        akrr_home: str = None,
        generate_db_only: bool = False,
        update: bool = False,
        old_akrr_home: str = None,
        skip_update_completed_dirs=False,
        skip_update_db=False,
        skip_saving_db_for_update=False):
    """
    Setup or update AKRR

    Parameters
    ----------
    akrr_db: connection string for the AKRR database. If None, a fresh
        setup uses self.default_akrr_db and an update rebuilds it from the
        old configuration.
    ak_db: connection string for the app kernel database. If None, a fresh
        setup reuses akrr_db and an update rebuilds it from the old
        configuration.
    xd_db: connection string for the XDMoD database. If None, a fresh
        setup reuses akrr_db and an update rebuilds it from the old
        configuration.
    install_cron_scripts: install cron scripts
    stand_alone: run without XDMoD
    akrr_home: custom location of akrr home
    generate_db_only: only generate DB, return right after DB is generated
    update: perform update from previous stable version
    old_akrr_home: location of old AKRR home for update
    skip_update_completed_dirs: during update, do not run
        UpdateCompletedDirs for the completed tasks directory
    skip_update_db: during update, do not run UpdateDataBase
    skip_saving_db_for_update: passed through to UpdateDataBase.update
    """
    hints_to_finish_update = ""
    if update:
        self.update = akrr.update.UpdateAKRR(old_akrr_home)

    # Set initial db conf
    if not update:
        if akrr_db is None:
            akrr_db = self.default_akrr_db
        # if ak_db and xd_db is not set use akrr_db
        if ak_db is None:
            ak_db = akrr_db
        if xd_db is None:
            xd_db = akrr_db
    else:
        # values which were not set explicitly are taken from old config
        old_cfg = self.update.old_cfg
        if akrr_db is None:
            akrr_db = set_user_password_host_port_db(
                old_cfg['akrr_db_user'], old_cfg['akrr_db_passwd'],
                old_cfg['akrr_db_host'], old_cfg['akrr_db_port'],
                old_cfg['akrr_db_name'])
        if ak_db is None:
            ak_db = set_user_password_host_port_db(
                old_cfg['ak_db_user'], old_cfg['ak_db_passwd'],
                old_cfg['ak_db_host'], old_cfg['ak_db_port'],
                old_cfg['ak_db_name'])
        if xd_db is None:
            xd_db = set_user_password_host_port_db(
                old_cfg['xd_db_user'], old_cfg['xd_db_passwd'],
                old_cfg['xd_db_host'], old_cfg['xd_db_port'],
                old_cfg['xd_db_name'])

    # Get db details
    self.akrr_db_user_name, self.akrr_db_user_password, self.akrr_db_host, self.akrr_db_port, self.akrr_db_name = \
        get_user_password_host_port_db(akrr_db, default_database="mod_akrr")

    self.ak_db_user_name, self.ak_db_user_password, self.ak_db_host, self.ak_db_port, self.ak_db_name = \
        get_user_password_host_port_db(ak_db, default_database="mod_appkernel")

    self.xd_db_user_name, self.xd_db_user_password, self.xd_db_host, self.xd_db_port, self.xd_db_name = \
        get_user_password_host_port_db(xd_db, default_database="modw")

    self.stand_alone = stand_alone
    self.generate_db_only = generate_db_only
    self.install_cron_scripts_flag = install_cron_scripts

    self.akrr_home_dir = akrr_home

    # check
    self.check_utils()

    # get directories layout
    global _akrr_dirs, _akrr_home, _akrr_cfg
    self._initial_akrr_dirs = _akrr_dirs
    self._akrr_dirs = akrr.get_akrr_dirs(self.akrr_home_dir)
    _akrr_dirs = self._akrr_dirs
    _akrr_home = _akrr_dirs["akrr_home"]
    _akrr_cfg = _akrr_dirs["akrr_cfg"]

    if self.update:
        # require that old and new akrr home was different.
        # Compare the home path itself: _akrr_dirs is the whole directory
        # layout mapping and would never be equal to a path.
        if _akrr_home == self.update.old_akrr_home:
            log.error(
                "Old and new akrr home directories should be different. Rename old akrr home.\n" +
                "\tOld AKRR home: %s\n\tNew AKRR home: %s",
                self.update.old_akrr_home, _akrr_home)
            exit(1)
        # shut down old daemon, remove it from cron and update DB
        self.update.remove_old_akrr_from_crontab()
        self.update.shut_down_old_akrr()

    if not self.update:
        # check previous installation
        self.check_previous_installation()

    # set installation directory
    self.init_dir()

    if not self.update:
        # ask info
        self.read_db_user_credentials()

        if self.install_cron_scripts_flag and not self.generate_db_only:
            self.ask_cron_email()

    # at this point all questions to the user have been asked

    self.init_mysql_dbs()
    self.generate_self_signed_certificate()
    cfg = self.generate_settings_file()

    if self.update:
        # copy old logs
        if not skip_update_completed_dirs:
            akrr.update.UpdateCompletedDirs(
                self.update.old_cfg["completed_tasks_dir"],
                cfg["completed_tasks_dir"]).run()

        # update DB
        if not skip_update_db:
            akrr.update.UpdateDataBase(self.update).update(
                skip_saving_db_for_update=skip_saving_db_for_update)

        # update config files for resources and appkernels
        hints_to_finish_update = akrr.update.UpdateResourceAppConfigs(
            self.update).update()

    self.set_permission_on_files()
    self.db_check()

    if not self.update:
        self.generate_tables()

    if self.generate_db_only:
        log.info("AKRR DB Generated")
        return

    self.update_bashrc()

    self.start_daemon()
    self.check_daemon()
    if self.install_cron_scripts_flag:
        self.install_cron_scripts()

    log.info("AKRR is set up and is running.")
    if self.update:
        log.warning(
            "Below are instructions to finish conversion " +
            "(shell commands, execute them manually one by one ensure correct run):\n" +
            hints_to_finish_update)
def install_cron_scripts(self):
    """
    Install cron scripts.

    Reads the current user's crontab (``crontab -l``), makes sure it holds
    the AKRR daemon ``restart`` and ``checknrestart`` entries and that the
    MAILTO line agrees with ``self.cron_email``, then loads the result back
    with ``crontab``. Does nothing besides logging when ``dry_run`` is set
    or when the crontab is already in the desired state.
    """
    log.info("Installing cron entries")
    if dry_run:
        return

    # MAILTO line to enforce; None when no (non-empty) e-mail is configured
    mail = "MAILTO = " + self.cron_email if self.cron_email else None
    restart = "50 23 * * * " + akrr_bin_dir + "/akrr daemon -cron restart"
    check_and_restart = "33 * * * * " + akrr_bin_dir + "/akrr daemon -cron checknrestart"

    try:
        crontab_content = subprocess.check_output("crontab -l", shell=True)
        # keep line endings, the lines are written back verbatim
        crontab_content = crontab_content.decode("utf-8").splitlines(True)
    except Exception:
        # best effort: a failing "crontab -l" is treated as an empty crontab
        log.info("Crontab does not have user's crontab yet")
        crontab_content = []

    mail_updated = False
    mail_there = False
    restart_there = False
    check_and_restart_there = False

    for i, line in enumerate(crontab_content):
        stripped = line.strip()
        # skip blank lines and comments
        if len(stripped) <= 1 or stripped[0] == "#":
            continue

        m = re.match(r'^MAILTO\s*=\s*(.*)', stripped)
        if m:
            cron_email = m.group(1).replace('"', '')
            mail_there = True
            if self.cron_email != cron_email:
                if mail:
                    # lines keep their terminator, so the replacement needs
                    # one too or the next entry would be glued to it
                    crontab_content[i] = mail + "\n"
                else:
                    crontab_content[i] = "#" + line
                mail_updated = True

        if "akrr" in line and "daemon" in line:
            if "checknrestart" in line:
                check_and_restart_there = True
            # "checknrestart" contains "restart"; do not let the hourly
            # entry pass for the nightly restart entry
            if "restart" in line.replace("checknrestart", ""):
                restart_there = True

    if mail_updated:
        log.info("Cron's MAILTO was updated")

    mail_state_ok = (self.cron_email is not None and mail_there) or \
                    (self.cron_email is None and not mail_there)
    if mail_state_ok and restart_there and check_and_restart_there and not mail_updated:
        log.warning("All AKRR crond entries found. No modifications necessary.")
        return

    # make sure appended entries start on their own line
    if crontab_content and not crontab_content[-1].endswith("\n"):
        crontab_content[-1] += "\n"

    # mail is None for an empty e-mail, nothing to insert in that case
    if mail is not None and not mail_there:
        crontab_content.insert(0, mail + "\n")
    if not restart_there:
        crontab_content.append(restart + "\n")
    if not check_and_restart_there:
        crontab_content.append(check_and_restart + "\n")

    tmp_file = ".crontmp"
    try:
        with open(tmp_file, 'w') as f:
            f.writelines(crontab_content)
        subprocess.call("crontab .crontmp", shell=True)
    finally:
        # do not leave the temporary file behind, even on failure
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

    log.info("Cron Scripts Processed!")