def runcmd(self, cmd, clearSpecialSymbols=True, printOutput=False, addAfter=False):
    """
    Run `cmd` in the interactive remote session and return its captured output.

    :param cmd: command line to send to the remote shell.
    :param clearSpecialSymbols: passed through to getCmdOutput; strip
        terminal control sequences from the captured output.
    :param printOutput: if True log the output at info level, otherwise
        at debug2 level.
    :param addAfter: passed through to getCmdOutput.
    :return: the command's captured output as a string.

    Side effects: updates self.lastcmd and, when present, appends to
    self.output.
    """
    self.sendline(cmd)
    self.expect(self.prompt)
    self.lastcmd = cmd + "\n"
    output = self.getCmdOutput(clearSpecialSymbols=clearSpecialSymbols, addAfter=addAfter, replaceCMD=False)
    # some sessions accumulate a transcript in self.output
    if hasattr(self, 'output'):
        self.output += output
    # previously the same message-formatting logic was duplicated across
    # four branches; choose the log function once instead
    log_func = log.info if printOutput else log.debug2
    if self.echo:
        log_func(output)
    else:
        log_func("command: `{}` output: \n{}".format(cmd, output))
    sys.stdout.flush()
    return output
def load_all_resources():
    """
    Load every resource definition found in the configuration directory
    into the module-level `resources` dictionary.

    Entries named 'notactive' or 'templates' are skipped; a failure to
    load one resource is logged and does not stop the others.
    """
    global resources  # pylint: disable=global-statement
    resources_dir = os.path.join(cfg_dir, "resources")
    skipped_entries = ('notactive', 'templates')
    for resource_name in os.listdir(resources_dir):
        if resource_name in skipped_entries:
            continue
        log.debug2("loading "+resource_name)
        try:
            resources[resource_name] = load_resource(resource_name)
        except Exception as e:  # pylint: disable=broad-except
            log.exception("Exception occurred during resources loading:"+str(e))
def remove_tasks_state_dumps(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    Remove task state dump files from completed task directories older
    than `days_old` days.

    :param days_old: only process task directories whose timestamp is at
        least this many days in the past.
    :param resources: comma-separated values (or None for all) used to
        filter resources; parsed by get_list_from_comma_sep_values.
    :param appkernels: comma-separated values (or None for all) used to
        filter app kernels.

    Honors self.dry_run: when set, matching files are counted but not
    deleted.
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Removing tasks state dumps")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    timenow = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels):
        try:
            # the directory basename is the task activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (timenow - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            proc_dir = os.path.join(task_dir, "proc")
            if not os.path.isdir(proc_dir):
                continue
            for state_file in os.listdir(proc_dir):
                if _state_dump.match(state_file) is None:
                    continue
                # bug fix: state_file was previously passed as a stray
                # logging argument and never appeared in the message
                log.debug2(" delete: " + state_file)
                state_file_fullpath = os.path.join(proc_dir, state_file)
                count += 1
                if not self.dry_run:
                    os.remove(state_file_fullpath)
        except Exception:  # was a bare except; don't swallow SystemExit/KeyboardInterrupt
            log.error("Cannot process: " + task_dir)
    log.info("Removed %d task state dumps" % count)
def remove(
        db_akrr=False, db_appkernel=False, db_modw=False, db_user=False,
        conf_dir=False, log_dir=False, bashrc=False,
        crontab=False, crontab_remove_mailto=False, **kwargs):
    """
    Remove the AKRR components selected by the boolean flags.

    AKRR is stopped first; afterwards each requested removal step runs
    in a fixed order (user, databases, directories, bashrc, crontab).
    Extra keyword arguments are accepted and logged but ignored.
    """
    log.debug(
        "Removal options for removal:\n"
        " db_akrr: {}\n"
        " db_appkernel: {}\n"
        " db_modw: {}\n"
        " db_user: {}\n"
        " conf_dir: {}\n"
        " log_dir: {}\n"
        " bashrc: {}\n"
        " crontab: {} , crontab_remove_mailto: {}\n"
        "".format(
            db_akrr, db_appkernel, db_modw, db_user, conf_dir, log_dir,
            bashrc, crontab, crontab_remove_mailto)
    )
    _stop_akrr()
    log.debug2("Unused keyword arguments: {}".format(kwargs))
    # run the requested steps in order; each pairs a flag with its action
    removal_steps = (
        (db_user, _remove_user),
        (db_akrr, _remove_akrr_db),
        (db_appkernel, _remove_ak_db),
        (db_modw, _remove_modw_db),
        (conf_dir, _remove_conf_dir),
        (log_dir, _remove_log_dir),
        (bashrc, _remove_from_bashrc),
    )
    for requested, action in removal_steps:
        if requested:
            action()
    # crontab removal takes an extra argument, handle it separately
    if crontab:
        _remove_from_crontab(crontab_remove_mailto)
def make_dirs(path, verbose=True):
    """
    Recursively create directory `path` unless running in dry-run mode.

    :param path: directory to create (parents included, mode 0o755).
    :param verbose: log debug2 messages about what is (or isn't) done.
    :raises akrrerror.AkrrError: if `path` exists but is not a directory.
    """
    import akrr
    from akrr import akrrerror
    import os
    from akrr.util import log
    # dry run: only report what would happen
    if akrr.dry_run:
        log.dry_run("make_dirs(%s)", path)
        return
    # already there and a directory: nothing to do
    if os.path.isdir(path):
        if verbose:
            log.debug2("Directory %s already exists.", path)
        return
    # exists but is not a directory: refuse
    if os.path.exists(path):
        raise akrrerror.AkrrError(
            "Can not create directory %s, because it exists and is not directory" % path)
    if verbose:
        log.debug2("Creating directory: {}".format(path))
    os.makedirs(path, mode=0o755)
def generate_settings_file(self) -> dict:
    """
    Generate configuration (akrr.conf) file

    Return dictionary with configuration
    """
    log.info("Generating configuration file ...")
    # akrr.conf is produced by substituting the values computed below
    # into the bundled template via str.format
    with open(os.path.join(_akrr_mod_dir, 'templates', 'akrr.conf'), 'r') as f:
        akrr_inp_template = f.read()
    # on update, keep the REST API credentials from the previous install
    if not self.update:
        restapi_rw_password = self.get_random_password()
        restapi_ro_password = self.get_random_password()
    else:
        restapi_rw_password = self.update.old_cfg['restapi_rw_password']
        restapi_ro_password = self.update.old_cfg['restapi_ro_password']
    # template substitution values; string-typed DB entries are pre-quoted
    # because they are inserted verbatim into a Python config file
    cfg = {
        'akrr_db_host': '"%s"' % self.akrr_db_host,
        'akrr_db_port': '%s' % str(self.akrr_db_port),
        'akrr_db_user_name': '"%s"' % self.akrr_db_user_name,
        'akrr_db_user_password': '******' % self.akrr_db_user_password,
        'akrr_db_name': '"%s"' % self.akrr_db_name,
        'ak_db_name': '"%s"' % self.ak_db_name,
        'xd_db_name': '"%s"' % self.xd_db_name,
        'restapi_host': "localhost",
        'restapi_port': 8091,
        'restapi_apiroot': '/api/v1',
        'restapi_certfile': 'server.pem',
        'restapi_token_expiration_time': 3600,
        'restapi_rw_username': '******',
        'restapi_rw_password': restapi_rw_password,
        'restapi_ro_username': '******',
        'restapi_ro_password': restapi_ro_password,
        'data_dir': "../log/data",
        'completed_tasks_dir': "../log/comptasks",
        'max_task_handlers': 4,
        'task_pickling_protocol': 0,
        'scheduled_tasks_loop_sleep_time': 1.0,
        'max_fatal_errors_for_task': 10,
        # timedelta entries are stored as source-code strings; they are
        # evaluated when the generated config is imported
        'active_task_default_attempt_repeat': 'datetime.timedelta(minutes=30)',
        'max_wall_time_for_task_handlers': 'datetime.timedelta(minutes=30)',
        'repeat_after_forcible_termination': 'active_task_default_attempt_repeat',
        'max_fails_to_submit_to_the_queue': 48,
        'repeat_after_fails_to_submit_to_the_queue': 'datetime.timedelta(hours=1)',
        'max_time_in_queue': 'datetime.timedelta(days=10)',
        'export_db_repeat_attempt_in': 'datetime.timedelta(hours=1)',
        'export_db_max_repeat_attempts': 48,
        'default_task_params': "{'test_run': False}",
        'akrr_version': akrrversion
    }
    # if the app-kernel DB credentials are identical to the AKRR DB ones,
    # reference the akrr_db_* config variables instead of duplicating values
    if self.akrr_db_host == self.ak_db_host and self.akrr_db_port == self.ak_db_port and \
            self.akrr_db_user_name == self.ak_db_user_name and \
            self.akrr_db_user_password == self.ak_db_user_password:
        cfg.update({
            'ak_db_host': 'akrr_db_host',
            'ak_db_port': 'akrr_db_port',
            'ak_db_user_name': 'akrr_db_user',
            'ak_db_user_password': '******'
        })
    else:
        cfg.update({
            'ak_db_host': '"%s"' % self.ak_db_host,
            'ak_db_port': '%s' % str(self.ak_db_port),
            'ak_db_user_name': '"%s"' % self.ak_db_user_name,
            'ak_db_user_password': '******' % self.ak_db_user_password
        })
    # likewise for the XDMoD (modw) DB: reuse the akrr or ak credentials
    # when they match, otherwise emit literal values
    if self.xd_db_host == self.akrr_db_host and self.xd_db_port == self.akrr_db_port and \
            self.xd_db_user_name == self.akrr_db_user_name and \
            self.xd_db_user_password == self.akrr_db_user_password:
        cfg.update({
            'xd_db_host': 'akrr_db_host',
            'xd_db_port': 'akrr_db_port',
            'xd_db_user_name': 'akrr_db_user',
            'xd_db_user_password': '******',
        })
    elif self.xd_db_host == self.ak_db_host and self.xd_db_port == self.ak_db_port and \
            self.xd_db_user_name == self.ak_db_user_name and \
            self.xd_db_user_password == self.ak_db_user_password:
        cfg.update({
            'xd_db_host': 'ak_db_host',
            'xd_db_port': 'ak_db_port',
            'xd_db_user_name': 'ak_db_user',
            'xd_db_user_password': '******',
        })
    else:
        cfg.update({
            'xd_db_host': '"%s"' % self.xd_db_host,
            'xd_db_port': '%s' % str(self.xd_db_port),
            'xd_db_user_name': '"%s"' % self.xd_db_user_name,
            'xd_db_user_password': '******' % self.xd_db_user_password,
        })
    # on update, carry user-tuned settings over from the old configuration;
    # repr() re-serializes values that must be Python source in the template
    if self.update:
        cfg['restapi_host'] = self.update.old_cfg['restapi_host']
        cfg['restapi_port'] = self.update.old_cfg['restapi_port']
        cfg['restapi_apiroot'] = self.update.old_cfg['restapi_apiroot']
        cfg['restapi_certfile'] = self.update.old_cfg['restapi_certfile']
        cfg['restapi_token_expiration_time'] = self.update.old_cfg['restapi_token_expiration_time']
        cfg['restapi_rw_username'] = self.update.old_cfg['restapi_rw_username']
        cfg['restapi_rw_password'] = self.update.old_cfg['restapi_rw_password']
        cfg['restapi_ro_username'] = self.update.old_cfg['restapi_ro_username']
        cfg['restapi_ro_password'] = self.update.old_cfg['restapi_ro_password']
        #cfg['data_dir'] = self.update.old_cfg['data_dir']
        #cfg['completed_tasks_dir'] = self.update.old_cfg['completed_tasks_dir']
        cfg['max_task_handlers'] = self.update.old_cfg['max_task_handlers']
        cfg['task_pickling_protocol'] = self.update.old_cfg['task_pickling_protocol']
        cfg['scheduled_tasks_loop_sleep_time'] = self.update.old_cfg['scheduled_tasks_loop_sleep_time']
        cfg['max_fatal_errors_for_task'] = self.update.old_cfg['max_fatal_errors_for_task']
        cfg['active_task_default_attempt_repeat'] = repr(
            self.update.old_cfg['active_task_default_attempt_repeat'])
        cfg['max_wall_time_for_task_handlers'] = repr(
            self.update.old_cfg['max_wall_time_for_task_handlers'])
        cfg['repeat_after_forcible_termination'] = repr(
            self.update.old_cfg['repeat_after_forcible_termination'])
        cfg['max_fails_to_submit_to_the_queue'] = self.update.old_cfg['max_fails_to_submit_to_the_queue']
        cfg['repeat_after_fails_to_submit_to_the_queue'] = repr(
            self.update.old_cfg['repeat_after_fails_to_submit_to_the_queue'])
        cfg['max_time_in_queue'] = repr(self.update.old_cfg['max_time_in_queue'])
        cfg['export_db_repeat_attempt_in'] = repr(self.update.old_cfg['export_db_repeat_attempt_in'])
        cfg['export_db_max_repeat_attempts'] = self.update.old_cfg['export_db_max_repeat_attempts']
        cfg['default_task_params'] = repr(self.update.old_cfg['default_task_params'])
        # collapse to the symbolic reference when the two reprs are equal
        if cfg['repeat_after_forcible_termination'] == cfg['active_task_default_attempt_repeat']:
            cfg['repeat_after_forcible_termination'] = 'active_task_default_attempt_repeat'
    akrr_inp = akrr_inp_template.format(**cfg)
    if not akrr.dry_run:
        with open(_akrr_cfg, 'w') as f:
            f.write(akrr_inp)
        log.info("Configuration is written to: {0}".format(_akrr_cfg))
    else:
        log.dry_run("New config should be written to: {}".format(_akrr_cfg))
        log.debug2(akrr_inp)
    # reset data_dir completed_tasks_dir as absolute path for further use during setup/update
    if not os.path.isabs(cfg['data_dir']):
        cfg['data_dir'] = os.path.abspath(
            os.path.join(os.path.dirname(_akrr_cfg), cfg['data_dir']))
    if not os.path.isabs(cfg['completed_tasks_dir']):
        cfg['completed_tasks_dir'] = os.path.abspath(
            os.path.join(os.path.dirname(_akrr_cfg), cfg['completed_tasks_dir']))
    return cfg
def analyse_test_job_results(task_id, resource, app_name="test"):
    """analysing the output

    Fetch the completed test task from the AKRR REST API, parse its
    XDMoD instance-info XML report, and validate the execution
    environment (scratch dirs, shell, node list, PPN).  Exits the
    process with status 1 on fatal problems; non-fatal problems only
    increment log.error_count.
    """
    log.info("Test job is completed analyzing output\n")
    test_job_lock_filename = get_test_job_lock_filename(resource, app_name)
    # fetch the task record from the AKRR REST API
    r = akrrrestclient.get('/tasks/%d' % task_id)
    if r.status_code != 200:
        log.error(
            "Can not get information about task\nSee full error report below\nAKRR server response:\n%s\n",
            r.text)
        exit(1)
    completed_tasks = r.json()['data']['data']['completed_tasks']
    akrr_xdmod_instance_info = r.json()['data']['data']['akrr_xdmod_instanceinfo']
    akrr_errmsg = r.json()['data']['data'].get('akrr_errmsg', "None")
    results_summary = make_results_summary(
        resource['name'], app_name, completed_tasks, akrr_xdmod_instance_info, akrr_errmsg)
    if completed_tasks['status'].count("ERROR") > 0:
        # execution was not successful
        if completed_tasks['status'].count(
                "ERROR Can not created batch job script and submit it to remote queue") > 0:
            log.error(
                "Can not created batch job script and/or submit it to remote queue\nSee full error report below\n%s",
                results_summary)
        else:
            log.error("Status: %s\nSee full error report below\n%s",
                      completed_tasks['status'], results_summary)
        # release the lock so the test can be re-run
        os.remove(test_job_lock_filename)
        exit(1)
    if akrr_xdmod_instance_info['status'] == 0:
        # execution was not successful
        log.error(
            "Task execution was not successful\nSee full error report below\n%s",
            results_summary)
        os.remove(test_job_lock_filename)
        exit(1)
    # see what is in report
    elm_perf = xml.etree.ElementTree.fromstring(akrr_xdmod_instance_info['body'])
    elm_parameters = elm_perf.find('benchmark').find('parameters')
    elm_statistics = elm_perf.find('benchmark').find('statistics')
    # defaults used when the report omits an entry; '0' means check failed
    parameters = {'RunEnv:Nodes': '', 'App:ExeBinSignature': ''}
    statistics = {
        'Wall Clock Time': '0.0',
        'Network scratch directory exists': '0',
        'Network scratch directory accessible': '0',
        'App kernel input exists': '0',
        'Task working directory accessible': '0',
        'local scratch directory accessible': '0',
        'local scratch directory exists': '0',
        'App kernel executable exists': '0',
        'Task working directory exists': '0',
        'Shell is BASH': '0'
    }
    # collect <ID>/<value>/<units> triples from the parameters section
    for elm in list(elm_parameters):
        variable = elm.findtext('ID')
        if variable is not None:
            variable = variable.strip()
        value = elm.findtext('value')
        if value is not None:
            value = value.strip()
        units = elm.findtext('units')
        if units is not None:
            units = units.strip()
        if variable == 'App:ExeBinSignature' or variable == 'RunEnv:Nodes':
            # these two values are stored gzip-compressed and base64-encoded
            # NOTE(review): shells out with the value interpolated into the
            # command line — value comes from AKRR's own report, but consider
            # the base64/gzip stdlib modules instead of os.popen
            value = os.popen('echo "%s"|base64 -d|gzip -d' % (value, )).read()
        log.debug2("parameter: {} = {} {}".format(variable, value, units))
        parameters[variable] = value
    # same extraction for the statistics section
    for elm in list(elm_statistics):
        variable = elm.findtext('ID')
        if variable is not None:
            variable = variable.strip()
        value = elm.findtext('value')
        if value is not None:
            value = value.strip()
        units = elm.findtext('units')
        if units is not None:
            units = units.strip()
        statistics[variable] = value
        log.debug2("statistic: {} = {} {}".format(variable, value, units))
    files_exists = [
        'Network scratch directory exists',
        'App kernel input exists',
        'local scratch directory exists',
        'App kernel executable exists',
        'Task working directory exists'
    ]
    dirs_access = [
        'Network scratch directory accessible',
        'Task working directory accessible',
        'local scratch directory accessible'
    ]
    # environment sanity checks; each failed check bumps log.error_count
    if statistics['Shell is BASH'] == '0':
        log.error(
            "Shell on compute nodes of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        log.error_count += 1
    for file_exists in files_exists:
        if statistics[file_exists] == '0':
            log.error(file_exists.replace('exists', 'does not exist'))
            log.error_count += 1
    for dirAccess in dirs_access:
        if statistics[dirAccess] == '0':
            log.error(dirAccess.replace('accessible', 'is not accessible'))
            log.error_count += 1
    if parameters['App:ExeBinSignature'] == '':
        log.error(
            "Application signature calculator is not working, you might need to recompile it."
            "see application output for more hints")
        log.error_count += 1
    # cloud (OpenStack) resources have no fixed node list to verify
    if resource['batch_scheduler'].lower() != "openstack":
        # test the nodes, log to headnode and ping them
        if parameters['RunEnv:Nodes'] == '':
            log.error(
                "Nodes are not detected, check batch_job_template and setup of AKRR_NODELIST variable")
            log.error_count += 1
        nodes = parameters['RunEnv:Nodes'].split()
        # NOTE(review): eval() of DB-stored resource_param — trusted input
        # here, but ast.literal_eval would be safer
        requested_nodes = eval(completed_tasks['resource_param'])['nnodes']
        str_io = io.StringIO()
        try:
            # capture the chatty ssh session output into str_io
            sys.stdout = sys.stderr = str_io
            rsh = akrr.util.ssh.ssh_resource(resource)
            number_of_unknown_hosts = 0
            for node in set(nodes):
                log.debug2(node)
                out = akrr.util.ssh.ssh_command(rsh, "ping -c 1 %s" % node)
                if out.count("unknown host") > 0:
                    number_of_unknown_hosts += 1
            rsh.close(force=True)
            del rsh
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            if number_of_unknown_hosts > 0:
                log.error(
                    "ERROR %d: Can not ping compute nodes from head node\n" % (log.error_count + 1) +
                    "Nodes on which test job was executed detected as " + parameters['RunEnv:Nodes'] + "\n" +
                    "If these names does not have sense check batch_job_template and setup of AKRR_NODELIST "
                    "variable in resource configuration file")
                log.error_count += 1
        except Exception as e:
            # restore streams before reporting, otherwise the message is lost
            sys.stdout = sys.__stdout__
            sys.stderr = sys.__stderr__
            log.critical(
                "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
                resource['name'], str_io.getvalue())
            raise e
        # check ppn count
        if requested_nodes * resource['ppn'] != len(nodes):
            log.error(
                "ERROR {}: Number of requested processes (processes per node * nodes) "
                "do not match actual processes executed"
                "Either\n"
                " AKRR_NODELIST variable is set incorrectly\n"
                "Or\n"
                " processes per node (PPN) is wrong\n".format(log.error_count + 1))
            log.error_count += 1
    log.info("\nTest kernel execution summary:\n%s", results_summary)
    log.info("\nThe output looks good.\n")
def _add_fake_modw():
    """
    Create a minimal `modw` database with a `resourcefact` table holding
    two test resources (Alpha, Bravo), as required for AKRR functioning
    when no real XDMoD datawarehouse is available.

    Idempotent: each step (create DB, create table, populate rows) is
    skipped when the probe queries show it was already done.
    """
    log.info("Creating minimal modw database required for AKRR functioning if needed")
    # assume everything needs doing; probes below flip these off
    create_db = True
    create_table = True
    populate_table = True
    import MySQLdb
    from .db import get_xd_db
    # superuser connection to the XDMoD DB server
    con, cur = get_xd_db(su=True)
    from akrr.util.sql import db_exist
    if db_exist(cur, "modw"):
        create_db = False
        log.info("modw exists")
        try:
            # probe table existence; raises MySQLdb.Error if missing
            cur.execute("SELECT * FROM modw.resourcefact")
            cur.fetchall()
            create_table = False
            log.info("modw.resourcefact exists")
            cur.execute("SELECT * FROM modw.resourcefact WHERE code='Alpha' OR code='Bravo'")
            rs = cur.fetchall()
            if len(rs) == 2:
                populate_table = False
                log.info("modw.resourcefact contains Alpha and Bravo")
        except MySQLdb.Error:
            log.debug2("Either modw.resourcefact does not exist or unexpected values")
    if create_db:
        _cursor_execute(cur, "CREATE DATABASE IF NOT EXISTS modw")
    if create_table:
        # schema mirrors XDMoD's modw.resourcefact closely enough for AKRR
        _cursor_execute(cur, """
            USE modw;
            CREATE TABLE `resourcefact` (
              `id` INT NOT NULL,
              `resourcetype_id` INT,
              `organization_id` INT,
              `name` VARCHAR(200),
              `code` VARCHAR(64) NOT NULL,
              `description` VARCHAR(1000),
              `start_date` DATETIME,
              `start_date_ts` INT DEFAULT 0 NOT NULL,
              `end_date` DATETIME,
              `end_date_ts` INT,
              PRIMARY KEY (`id`, `start_date_ts`)
            );
            CREATE INDEX `aggregation_index` ON `resourcefact` (`resourcetype_id`, `id`);
            """)
    if populate_table:
        # two fake resources with a fixed 2010-01-01 start date
        _cursor_execute(
            cur, "INSERT INTO modw.resourcefact (" +
            "id, resourcetype_id, organization_id, name, code, description, " +
            " start_date, start_date_ts, end_date, end_date_ts) " +
            "VALUES (10, 1, 35, 'alpha', 'Alpha', null, " +
            " '2010-01-01 00:00:00.0', 1262322000, null, null);")
        _cursor_execute(
            cur, "INSERT INTO modw.resourcefact (" +
            "id, resourcetype_id, organization_id, name, code, description," +
            " start_date, start_date_ts, end_date, end_date_ts) " +
            "VALUES (11, 1, 35, 'bravo', 'Bravo', null, " +
            " '2010-01-01 00:00:00.0', 1262322000, null, null); ")
    con.commit()
    cur.close()
    con.close()
def generate_settings_file(self):
    """
    Generate configuration (akrr.conf) file

    Substitutes database connection settings and freshly generated REST
    API passwords into the bundled akrr.conf template and writes the
    result to `akrr_cfg` (unless in dry-run mode).
    """
    log.info("Generating configuration file ...")
    with open(os.path.join(akrr_mod_dir, 'templates', 'akrr.conf'), 'r') as f:
        akrr_inp_template = f.read()
    # fresh random credentials for the REST API accounts
    restapi_rw_password = self.get_random_password()
    restapi_ro_password = self.get_random_password()
    # template substitution values; string-typed DB entries are pre-quoted
    # because they are inserted verbatim into a Python config file
    var = {
        'akrr_db_host': '"%s"' % self.akrr_db_host,
        'akrr_db_port': '%s' % str(self.akrr_db_port),
        'akrr_db_user_name': '"%s"' % self.akrr_db_user_name,
        'akrr_db_user_password': '******' % self.akrr_db_user_password,
        'akrr_db_name': '"%s"' % self.akrr_db_name,
        'ak_db_name': '"%s"' % self.ak_db_name,
        'xd_db_name': '"%s"' % self.xd_db_name,
        'restapi_rw_password': restapi_rw_password,
        'restapi_ro_password': restapi_ro_password
    }
    # if the app-kernel DB credentials match the AKRR DB ones, reference
    # the akrr_db_* config variables instead of duplicating the values
    if self.akrr_db_host == self.ak_db_host and self.akrr_db_port == self.ak_db_port and \
            self.akrr_db_user_name == self.ak_db_user_name and \
            self.akrr_db_user_password == self.ak_db_user_password:
        var.update({
            'ak_db_host': 'akrr_db_host',
            'ak_db_port': 'akrr_db_port',
            'ak_db_user_name': 'akrr_db_user',
            'ak_db_user_password': '******'
        })
    else:
        var.update({
            'ak_db_host': '"%s"' % self.ak_db_host,
            'ak_db_port': '%s' % str(self.ak_db_port),
            'ak_db_user_name': '"%s"' % self.ak_db_user_name,
            'ak_db_user_password': '******' % self.ak_db_user_password
        })
    # likewise for the XDMoD (modw) DB: reuse akrr or ak credentials when
    # they match, otherwise emit literal values
    if self.xd_db_host == self.akrr_db_host and self.xd_db_port == self.akrr_db_port and \
            self.xd_db_user_name == self.akrr_db_user_name and \
            self.xd_db_user_password == self.akrr_db_user_password:
        var.update({
            'xd_db_host': 'akrr_db_host',
            'xd_db_port': 'akrr_db_port',
            'xd_db_user_name': 'akrr_db_user',
            'xd_db_user_password': '******',
        })
    elif self.xd_db_host == self.ak_db_host and self.xd_db_port == self.ak_db_port and \
            self.xd_db_user_name == self.ak_db_user_name and \
            self.xd_db_user_password == self.ak_db_user_password:
        var.update({
            'xd_db_host': 'ak_db_host',
            'xd_db_port': 'ak_db_port',
            'xd_db_user_name': 'ak_db_user',
            'xd_db_user_password': '******',
        })
    else:
        var.update({
            'xd_db_host': '"%s"' % self.xd_db_host,
            'xd_db_port': '%s' % str(self.xd_db_port),
            'xd_db_user_name': '"%s"' % self.xd_db_user_name,
            'xd_db_user_password': '******' % self.xd_db_user_password,
        })
    akrr_inp = akrr_inp_template.format(**var)
    if not dry_run:
        with open(akrr_cfg, 'w') as f:
            f.write(akrr_inp)
        log.info("Configuration is written to: {0}".format(akrr_cfg))
    else:
        log.dry_run("New config should be written to: {}".format(akrr_cfg))
        log.debug2(akrr_inp)