def db_check(mod_akrr=True, mod_appkernel=True, modw=True):
    """
    Verify accessibility and user privileges for the AKRR-related databases.

    Checks 'mod_akrr' and 'mod_appkernel' for read/write access and 'modw'
    for read access, depending on which flags are set.  Returns True only
    when every requested check passed.
    """
    results = []
    if mod_akrr:
        results.append(check_rw_db(
            akrr.db.get_akrr_db,
            "Checking 'mod_akrr' Database / User privileges...",
            "'mod_akrr' Database check complete - Status: %s"))
    if mod_appkernel:
        results.append(check_rw_db(
            akrr.db.get_ak_db,
            "Checking 'mod_appkernel' Database / User privileges...",
            "'mod_appkernel' Database check complete - Status: %s"))
    if modw:
        results.append(check_r_db(
            akrr.db.get_xd_db,
            "Checking 'modw' Database / User privileges...",
            "'modw' Database check complete - Status: %s"))
    # DETERMINE: whether or not everything passed.
    if all(results):
        log.info("All Databases / User privileges check out!")
        return True
    log.error(
        "One or more of the required databases and their required users ran into a problem. "
        "Please take note of the previous messages, correct the issue and re-run this script."
    )
    return False
def check_dir(sh, d, exit_on_fail=False, try_to_create=True, raise_on_fail=False):
    """
    check that directory exists and verify its accessibility

    return None,message if does not exists
    return True,message if can write there
    return False,message if can not write there

    sh: remote shell handle (has .remote_machine); d: directory path.
    With exit_on_fail, a missing/unwritable directory is fatal: raises
    AkrrError when raise_on_fail is set, otherwise exits the process.
    """
    status, msg = check_dir_simple(sh, d)
    # Missing directory: optionally try to create it on the remote machine.
    if try_to_create is True and status is None:
        log.info("Directory %s:%s does not exists, will try to create it", sh.remote_machine, d)
        if not akrr.dry_run:
            cmd = "mkdir -p \"%s\"" % (d,)
            ssh_command(sh, cmd)
            # re-check after the mkdir attempt
            status, msg = check_dir_simple(sh, d)
        else:
            # dry run: pretend the creation succeeded
            status, msg = (True, "Directory exist and accessible for read/write")
    if exit_on_fail is False:
        return status, msg
    # exit_on_fail set: a bad directory is fatal from here on
    if status is None:
        log.error("Directory %s:%s does not exists!", sh.remote_machine, d)
        if raise_on_fail:
            raise AkrrError("Directory %s:%s does not exists!" % (sh.remote_machine, d))
        else:
            exit()
    elif status is True:
        return True, msg
    else:
        log.error("Directory %s:%s is NOT accessible for read/write!", sh.remote_machine, d)
        if raise_on_fail:
            raise AkrrError("Directory %s:%s is NOT accessible for read/write!" % (sh.remote_machine, d))
        else:
            exit()
def daemon_stop(pid: int = None, timeout: float = 120.0):
    """
    Stop AKRR server.

    Sends SIGTERM to the daemon process and waits (polling every 0.5s)
    up to `timeout` seconds for it to exit.

    Args:
        pid: process id of the running daemon, or None if not running.
        timeout: maximum seconds to wait for the process to exit.

    Return True on success or False on timeout
    """
    import os
    import time
    import psutil
    import signal

    if pid is None:
        log.info("AKRR is already not running.")
        return True

    log.info("Sending termination signal to AKRR server (PID: %s)", pid)
    # ask the daemon to terminate gracefully
    os.kill(pid, signal.SIGTERM)

    # wait till the process has finished
    start = time.time()
    while psutil.pid_exists(pid):
        time.sleep(0.5)
        if time.time() - start > timeout:
            # bug fix: message previously read "Can not stopped daemon!"
            log.error("Can not stop daemon!")
            return False

    log.info("Stopped AKRR server (PID: %s)", pid)
    return True
def _set_env(self):
    """Locate the environment setup script, source it, and cache $OS_TOKEN."""
    # get location of openstack_env_set_script (resolved once, then cached)
    if self._which_env_set_script is None:
        if os.path.isfile(self._env_set_script):
            self._which_env_set_script = self._env_set_script
        else:
            # not a plain file path: look it up in PATH
            self._which_env_set_script = self.run_cmd("which " + self._env_set_script)
            self._which_env_set_script = self._which_env_set_script.strip()
            if self._which_env_set_script.endswith(self._env_set_script):
                self._which_env_set_script = os.path.expanduser(self._which_env_set_script)
        log.debug("which_openstack_env_set_script: " + self._which_env_set_script)
    # check presence of openstack_env_set_script
    if not os.path.isfile(self._which_env_set_script):
        msg = "Can not find openstack environment setup script: " + self._env_set_script
        msg += "\n" + self._which_env_set_script
        log.error(msg)
        raise FileNotFoundError(msg)
    # set environment; back off 30s on rate limiting
    # NOTE(review): this retries forever if HTTP 429 never clears — confirm a
    # retry cap is not wanted here.
    while True:
        out = self.run_cmd("source " + self._which_env_set_script)
        if out.count("HTTP Client Error (HTTP 429)") > 0:
            time.sleep(30)
        else:
            break
    self._token = self.run_cmd("echo $OS_TOKEN").strip()
def load_app(app_name: str, resources: Dict, app_cfg_filename: str = None, validate=True) -> Dict:
    """
    load app configuration file, do minimalistic validation

    return dict with app parameters

    raises error if can not load
    """
    log.debug("Loading app %s", app_name)
    app = load_app_default(app_name)
    # load resource specific parameters
    for resource_name in os.listdir(os.path.join(cfg_dir, "resources")):
        # skip service directories that are not actual resources
        if resource_name in ['notactive', 'templates']:
            continue
        app_on_resource_cfg_filename = os.path.join(
            cfg_dir, "resources", resource_name, app_name + ".app.conf")
        if not os.path.isfile(app_on_resource_cfg_filename):
            continue
        try:
            app['appkernel_on_resource'][resource_name] = load_app_on_resource(
                app_name, resource_name, resources[resource_name], app)
        except Exception:
            # bug fix: a space was missing between the two message sentences
            # ("...app.conf.Will skip it for now.")
            log.error(
                "Exception occurred during app kernel configuration loading for %s from %s."
                % (app_name, app_on_resource_cfg_filename) + " Will skip it for now.")
            raise AkrrError("Can not load app configuration for %s." % app_name)
    app = verify_app_params(app, app)
    return app
def _set_openstack_env(self):
    """Locate the OpenStack env setup script, source it, and cache $OS_TOKEN."""
    # get location of openstack_env_set_script (resolved once, then cached)
    if self._which_openstack_env_set_script is None:
        if os.path.isfile(self._openstack_env_set_script):
            self._which_openstack_env_set_script = self._openstack_env_set_script
        else:
            # not a plain file path: look it up in PATH on the resource
            self._which_openstack_env_set_script = self._shell.run_command(
                "which " + self._openstack_env_set_script)
            self._which_openstack_env_set_script = self._which_openstack_env_set_script.strip()
            if self._which_openstack_env_set_script.endswith(self._openstack_env_set_script):
                self._which_openstack_env_set_script = os.path.expanduser(
                    self._which_openstack_env_set_script)
        log.debug("which_openstack_env_set_script: " + self._which_openstack_env_set_script)
    # check presence of openstack_env_set_script
    if not os.path.isfile(self._which_openstack_env_set_script):
        msg = "Can not find openstack environment setup script: " + self._openstack_env_set_script
        msg += "\n" + self._which_openstack_env_set_script
        log.error(msg)
        raise FileNotFoundError(msg)
    # set environment
    # NOTE(review): unlike the sibling _set_env, this version does not retry on
    # "HTTP Client Error (HTTP 429)" rate limiting — confirm whether it should.
    self.run_cmd("source " + self._which_openstack_env_set_script)
    self._token = self.run_cmd("echo $OS_TOKEN").strip()
def connect_to_resource(resource):
    """
    connect to resource defined in resource dictionary

    Returns the ssh handle on success; exits the process on failure.
    """
    log.info("Validating resource accessibility. Connecting to %s.", resource['name'])
    if resource['ssh_private_key_file'] is not None and os.path.isfile(
            resource['ssh_private_key_file']) is False:
        log.error("Can not access ssh private key (%s)""", resource['ssh_private_key_file'])
        exit(1)
    str_io = io.StringIO()
    try:
        # capture the noisy ssh negotiation output; it is shown only on failure
        sys.stdout = sys.stderr = str_io
        rsh = akrr.util.ssh.ssh_resource(resource)
    except AkrrError:
        # restore streams before logging so the message is actually visible
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        log.critical("Can not connect to %s\nMessage:\n%s", resource['name'], str_io.getvalue())
        exit(1)
    finally:
        # bug fix: an unexpected exception type previously left stdout/stderr
        # redirected for the rest of the process
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
    log.info("Successfully connected to %s\n", resource['name'])
    log.empty_line()
    return rsh
def init_dir():
    """
    Make directories for configuration and logging
    """
    try:
        log.info("Creating directories structure.")
        # bug fix: the 'etc/resources' check/creation was duplicated in the
        # original copy-paste chain; list each required subdirectory once.
        required = [
            [],
            ['etc'],
            ['etc', 'resources'],
            ['log'],
            ['log', 'data'],
            ['log', 'comptasks'],
            ['log', 'akrrd'],
        ]
        for parts in required:
            path = os.path.join(_akrr_home, *parts)
            if not os.path.isdir(path):
                make_dirs(path)
    except Exception as e:
        log.error("Can not create directories: " + str(e))
        exit(1)
# NOTE(review): this function appears to have been mangled by a
# credential-scrubbing tool — the "******" runs replaced real code in the
# signature (likely `username=None, password=None, default_username="user"`)
# and in the password-prompt loop.  Recover the original from version control
# before editing; the code below is preserved verbatim and is not valid Python.
def _read_username_password(prompt="Enter username:"******"user", password_on_default_user=None): if username is None: log.log_input(prompt) if username is None: username = input('[{0}]: '.format(default_username)) if username == '': username = default_username else: log.info("User, " + username + ", already entered.") if username == default_username and password is None and password_on_default_user is not None: password = password_on_default_user if password is None: while True: log.log_input("Please specify a password:"******"Please reenter the password:"******"Entered passwords do not match. Please try again.") else: log.info("Password already entered.") return username, password
def get_system_characteristics():
    """detect system characteristics or ask user about them"""
    global ppn
    done = False
    # keep prompting until the user types a valid integer
    while not done:
        try:
            log.log_input("Enter processors (cores) per node count:")
            ppn = int(input(""))
            done = True
        except (ValueError, TypeError):
            log.error("Incorrect entry, try again.")
def check_shell(rsh, resource):
    """Verify that the shell on the resource headnode is BASH; exit otherwise."""
    log.info("Checking if shell is BASH\n")
    out = akrr.util.ssh.ssh_command(rsh, "echo $BASH")
    if "bash" not in out:
        log.error(
            "Shell on headnode of %s is not BASH, change it to bash and try again.\n",
            resource['name'])
        exit(1)
    log.info("Shell is BASH\n")
def is_api_up():
    """Return True if the AKRR REST API answers a simple GET request."""
    from akrr import akrrrestclient
    response = akrrrestclient.get("/scheduled_tasks")
    if response.status_code != 200:
        log.error(
            'Unable to successfully contact the REST API: %s: %s',
            response.status_code, response.text)
        return False
    return True
def run(self):
    """execute what asked in command line"""
    log.info("AKRR Regression Tests")
    parsed = self.root_parser.parse_args()
    self.process_common_args(parsed)
    # dispatch to the handler installed via .set_defaults(func=...)
    func = getattr(parsed, "func", None)
    if func is None:
        log.error("There is no command specified!")
    else:
        func(parsed)
def archive_tasks_by_months(self, months_old: int, resources=None, appkernels=None) -> None:
    """
    archive old task by months

    Any per-task .tar.gz archives inside a month directory are first
    extracted, then the whole month directory is re-archived as a single
    <month>.tar.gz and the directory removed.  `resources`/`appkernels`
    are comma-separated filters (None means all).
    """
    log.info("Archiving tasks by months")
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    time_now = datetime.datetime.now()
    # linearized month index so ages compare with plain subtraction
    mega_month_now = time_now.month + time_now.year * 12
    count = 0
    task_month_dirs = self.get_tasks_month_dir_list(resources, appkernels)
    n_task_month_dirs = len(task_month_dirs)
    # refresh the progress bar roughly 50 times over the whole run
    progress_update = max(int(round(n_task_month_dirs / 50)), 1)
    for i in range(n_task_month_dirs):
        task_month_dir = task_month_dirs[i]
        if i % progress_update == 0:
            progress_bar(i / n_task_month_dirs)
        # directory layout is .../<year>/<month>
        month = os.path.basename(task_month_dir)
        year = os.path.basename(os.path.dirname(task_month_dir))
        mega_month = int(month) + int(year) * 12
        if mega_month_now - mega_month < months_old:
            continue
        # unzip archives (previously archived individual tasks)
        for archive in [f for f in os.listdir(task_month_dir) if f.endswith(".tar.gz")]:
            archive_path = os.path.join(task_month_dir, archive)
            try:
                targz = tarfile.open(archive_path, "r|gz")
                targz.extractall(task_month_dir)
                targz.close()
                os.remove(archive_path)
            except Exception as e:
                log.error("Can not extract %s:\n %s", archive_path, str(e))
        # zip month
        try:
            targz = tarfile.open(task_month_dir + '.tar.gz', "w|gz")
            targz.add(task_month_dir, month)
            targz.close()
            shutil.rmtree(task_month_dir)
            count += 1
        except Exception as e:
            log.error("Can not archive %s\n %s", task_month_dir, str(e))
    progress_bar()
    log.info("Archived %d task months" % count)
def run(self, args=None):
    """parse arguments and execute requested commands"""
    # PARSE: the command line parameters the user provided.
    parsed = self.root_parser.parse_args(args=args)
    self.process_common_args(parsed)
    # EXECUTE: the function provided in the '.set_defaults(func=...)'
    if not hasattr(parsed, "func"):
        log.error("There is no command specified!")
        return None
    return parsed.func(parsed)
def write_error_xml(self, filename, cdata=False):
    """
    Write an XDMoD-style error report XML file.

    Args:
        filename: path of the XML file to write.
        cdata: when True, wrap the error message in a CDATA section (for
            messages that may contain markup-breaking characters).

    Uses self.status (error cause), self.appName (reporter) and
    self.status_info (error message).  If the rendered document does not
    parse, it retries once with CDATA wrapping; a second failure is logged
    but the (possibly unreadable) content is still written.
    """
    import xml.etree.ElementTree

    # One template for both flavors; only the errorMsg wrapping differs.
    # (Previously the template was copy-pasted three times.)
    template = ("<body>\n"
                " <xdtas>\n"
                " <batchJob>\n"
                " <status>Error</status>\n"
                " <errorCause>%s</errorCause>\n"
                " <reporter>%s</reporter>\n"
                " <errorMsg>%s</errorMsg>\n"
                " </batchJob>\n"
                " </xdtas>\n"
                "</body>\n")

    def _render(use_cdata):
        # render the report, optionally CDATA-wrapping the message
        msg = "<![CDATA[%s]]>" % self.status_info if use_cdata else self.status_info
        return template % (self.status, self.appName, msg)

    content = _render(cdata)
    # now let's try to parse it to confirm the file will be readable
    try:
        xml.etree.ElementTree.fromstring(content)
    except Exception as e:
        log.error(
            "Cannot write readable XML file (%s), will try CDATA declaration" % str(e))
        content = _render(True)
        try:
            xml.etree.ElementTree.fromstring(content)
        except Exception as e2:
            log.error("Cannot write readable XML file!!! %s" % str(e2))
    with open(filename, "w") as fout:
        fout.write(content)
def check_connection_to_rest_api():
    """Verify the AKRR REST API is reachable; exit or re-raise on failure."""
    # get check connection
    try:
        resp = akrrrestclient.get('/scheduled_tasks')
        if resp.status_code == 200:
            return
        log.error(
            "Can not get token for AKRR REST API ( %s )\nSee server response below\n%s",
            akrrrestclient.restapi_host, json.dumps(resp.json(), indent=4))
        exit(1)
    except Exception as e:
        log.critical(
            "Can not connect to AKRR REST API ( %s )\nIs it running?\nSee full error report below",
            akrrrestclient.restapi_host)
        raise e
def validate_resource_name(m_resource_name: str) -> bool:
    """
    Return True when `m_resource_name` is usable for a new resource:
    non-empty, no existing config directory on disk, and no existing entry
    in the mod_appkernel or mod_akrr databases.
    """
    if m_resource_name.strip() == "":
        log.error("Bad name for resource, try a different name")
        return False
    # check config file presence
    file_path = os.path.abspath(os.path.join(resources_dir, m_resource_name))
    if os.path.exists(file_path):
        log.error(
            "Resource configuration directory (%s) for resource with name %s "
            "already present on file system, try a different name" % (file_path, m_resource_name,))
        return False
    # check the entry in mod_appkernel
    con_ak, cur_ak = akrr.db.get_ak_db(True)
    cur_ak.execute('''SELECT * FROM resource WHERE nickname=%s''', (m_resource_name,))
    resource_in_ak_db = cur_ak.fetchall()
    if len(resource_in_ak_db) != 0:
        log.error(
            "Resource with name %s already present in mod_appkernel DB, try a different name"
            % (m_resource_name,))
        return False
    # check the entry in mod_akrr
    db, cur = akrr.db.get_akrr_db(True)
    cur.execute('''SELECT * FROM resources WHERE name=%s''', (m_resource_name,))
    resource_in_db = cur.fetchall()
    if len(resource_in_db) != 0:
        log.error("Resource with name %s already present in mod_akrr DB, try a different name" % (m_resource_name,))
        return False
    return True
def reprocess_parsed(args):
    """Re-run result processing for completed tasks matching the CLI filters."""
    if not (args.resource and args.appkernel):
        log.error('Please provide a resource, app')
        exit(1)
    from akrr import daemon
    scheduler = daemon.AkrrDaemon(adding_new_tasks=True)
    scheduler.reprocess_completed_tasks(
        args.resource, args.appkernel, args.time_start, args.time_end, args.verbose)
def read_old_akrr_conf_dir(self, old_akrr_conf_dir):
    """Read old AKRR configuration file"""
    conf_file = os.path.join(old_akrr_conf_dir, "akrr.conf")
    # guard: the directory and the akrr.conf inside it must both exist
    if not os.path.isdir(old_akrr_conf_dir):
        log.error("Directory with old AKRR configuration do not exist: " + old_akrr_conf_dir)
        exit(1)
    if not os.path.isfile(conf_file):
        log.error("File with old AKRR configuration do not exist: " + conf_file)
        exit(1)
    from akrr.util import exec_files_to_dict
    log.info("Reading old AKRR configuration from: " + conf_file)
    self.old_akrr_conf = exec_files_to_dict(conf_file)
def check_utils():
    """
    check that ssh and openssl already installed
    """
    # bug fix: distutils is deprecated (PEP 632) and removed in Python 3.12;
    # shutil.which is the stdlib replacement for find_executable.
    from shutil import which
    errmsg = ""
    if not which('ssh'):
        errmsg += "Can not find ssh in PATH, please install it.\n"
    if not which('openssl'):
        errmsg += "Can not find openssl in PATH, please install it.\n"
    if errmsg != "":
        log.error(errmsg)
        exit(1)
def check_appsig(rsh, resource):
    """Run the app-signature calculator on the headnode against a known
    binary (md5sum) and confirm it emits the expected markers; exits on
    failure unless running in dry-run mode."""
    log.info("Testing app.signature calculator on headnode\n")
    out = akrr.util.ssh.ssh_command(
        rsh, "%s/execs/bin/appsigcheck.sh `which md5sum`" % (resource['appkernel_dir'],))
    # a working calculator prints both the signature banner and an MD5 line
    if out.count("===ExeBinSignature===") > 0 and out.count("MD5:") > 0:
        log.info("App.signature calculator is working on headnode\n")
    else:
        if akrr.dry_run:
            log.dry_run("App.signature calculator is not working\n")
            return
        log.error(
            "App.signature calculator is not working\n"
            + "See full error report below\n%s", out)
        exit(1)
def validate_resource_parameter_file(resource_name): """validate resource parameter file and return dictionary with resource configuration""" # @todo reuse cfg.verify_resource_params default_resource_param_filename = os.path.join(cfg.akrr_mod_dir, "default_conf", "default.resource.conf") resource_param_filename = os.path.join(cfg.cfg_dir, "resources", resource_name, "resource.conf") log.info("Validating %s parameters from %s", resource_name, resource_param_filename) if not os.path.isfile(resource_param_filename): log.error("resource parameters file (%s) does not exist!", resource_param_filename) exit(1) # check syntax try: tmp = {} exec( compile( open(default_resource_param_filename).read(), default_resource_param_filename, 'exec'), tmp) exec( compile( open(resource_param_filename).read(), resource_param_filename, 'exec'), tmp) except Exception as e: log.critical( "Can not load resource from %s.\nProbably invalid syntax.", resource_param_filename) raise e resource = None try: # now we can load akrr, parameters checking did h resource = cfg.find_resource_by_name(resource_name) except Exception as e: log.error("Can not load resource config from %s!\n%s\n%s", resource_param_filename, str(e), traceback.format_exc()) exit(1) log.info( "Syntax of %s is correct and all necessary parameters are present.", resource_param_filename) log.empty_line() return resource
def init_mysql_dbs(self):
    """
    Create AKRR database and access user, set the user access rights
    """
    try:
        def _create_db_user_gran_priv_if_needed(con_fun, user, password, db, priv, create):
            """
            Helping function to create db and user
            """
            if create:
                log.info("Creating %s and user to access it" % (db,))
            else:
                log.info("Setting user to access %s" % (db,))
            # connect with administrative (super user) credentials
            su_con, su_cur = con_fun(True, None)
            client_host = get_db_client_host(su_cur)
            if create:
                _cursor_execute(su_cur, "CREATE DATABASE IF NOT EXISTS %s" % (cv(db),))
            create_user_if_not_exists(su_cur, user, password, client_host, dry_run=dry_run)
            _cursor_execute(
                su_cur, "GRANT " + cv(priv) + " ON " + cv(db) + ".* TO %s@%s",
                (user, client_host))
            su_con.commit()

        # During self.read_db_creds db and user was checked and
        # if they do not exist or not good enough super user credentials
        # was asked so if they not None that means that
        # either user or db or user priv needed to be set
        if self.akrr_db_su_user_name is not None:
            _create_db_user_gran_priv_if_needed(
                self.get_akrr_db, self.akrr_db_user_name, self.akrr_db_user_password,
                self.akrr_db_name, "ALL", True)
        if not self.stand_alone:
            # mod_appkernel and modw are only needed with an XDMoD instance
            if self.ak_db_su_user_name is not None:
                _create_db_user_gran_priv_if_needed(
                    self.get_ak_db, self.ak_db_user_name, self.ak_db_user_password,
                    self.ak_db_name, "ALL", True)
            if self.xd_db_su_user_name is not None:
                # modw already exists (XDMoD owns it): grant SELECT only
                _create_db_user_gran_priv_if_needed(
                    self.get_xd_db, self.xd_db_user_name, self.xd_db_user_password,
                    self.xd_db_name, "SELECT", False)
    except Exception as e:
        import traceback
        traceback.print_exc()
        log.error("Can not execute the sql setup script: " + str(e))
        exit(1)
# NOTE(review): the "******" runs below were inserted by a credential-scrubbing
# tool and replaced real code (most likely the username input handling between
# the two prompts).  Recover the original from version control before editing;
# the code below is preserved verbatim and is not valid Python.
def _read_sql_su_credentials(host, port): while True: log.log_input( "Please provide an administrative database user (for {}:{}) " "under which the installation sql script should " "run (This user must have privileges to create " "users and databases).".format(host, port)) su_username = input("Username: "******"Please provide the password for the the user which you previously entered:") su_password = getpass.getpass() try: get_con_to_db(su_username, su_password, host, port) return su_username, su_password except Exception as e: log.error("MySQL error: " + str(e)) log.error("Entered credential is not valid. Please try again.")
def check_previous_installation(self):
    """
    check that AKRR is not already installed
    """
    # nothing on disk, or an explicit update run: nothing to complain about
    if not os.path.exists(_akrr_cfg):
        return
    if self.update:
        return
    msg = "This is a fresh installation script. " + _akrr_home + \
          " contains previous AKRR installation. Either uninstall it or see documentation on updates.\n\n"
    msg += "To uninstall AKRR manually:\n\t1)remove " + _akrr_cfg + "\n\t\trm " + _akrr_cfg + "\n"
    msg += "\t2) (optionally for totally fresh start) drop mod_akrr and mod_appkernel database\n"
    msg += "\t\tDROP DATABASE mod_appkernel;\n"
    msg += "\t\tDROP DATABASE mod_akrr;\n\n"
    log.error(msg)
    exit(1)
def remove_tasks_state_dumps(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    remove tasks state dumps

    Deletes per-task process state-dump files older than `days_old` days.
    `resources`/`appkernels` are comma-separated filters (None means all).
    Honors self.dry_run (counts but does not delete).
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Removing tasks state dumps")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    timenow = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels):
        try:
            # task dirs are named by their activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (timenow - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            proc_dir = os.path.join(task_dir, "proc")
            if not os.path.isdir(proc_dir):
                continue
            for state_file in os.listdir(proc_dir):
                if _state_dump.match(state_file) is None:
                    continue
                log.debug2(" delete:", state_file)
                state_file_fullpath = os.path.join(proc_dir, state_file)
                count += 1
                if not self.dry_run:
                    os.remove(state_file_fullpath)
        except Exception as e:
            # bug fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            # and hid the failure cause
            log.error("Cannot process: " + task_dir + "\n" + str(e))
    log.info("Removed %d task state dumps" % count)
def archive_tasks(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    archive old task

    Each completed-task directory older than `days_old` days is packed into
    <task_dir>.tar.gz and removed.  `resources`/`appkernels` are
    comma-separated filters (None means all).
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Archiving tasks")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    time_now = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    task_dirs = self.get_tasks_dir_list(resources, appkernels)
    n_task_dirs = len(task_dirs)
    # refresh the progress bar roughly 50 times over the whole run
    progress_update = max(int(round(n_task_dirs / 50)), 1)
    for i in range(n_task_dirs):
        task_dir = task_dirs[i]
        if i % progress_update == 0:
            progress_bar(i / n_task_dirs)
        try:
            # task dirs are named by their activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (time_now - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            out = tarfile.open(task_dir + '.tar.gz', mode='w|gz')
            out.add(task_dir, time_stamp)
            out.close()
            shutil.rmtree(task_dir)
            count += 1
        except Exception as e:
            # bug fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            # and hid the failure cause
            log.error("Cannot process: " + task_dir + "\n" + str(e))
    progress_bar()
    log.info("Archived %d tasks" % count)
def remove_tasks_workdir(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    remove tasks workdir

    Deletes the jobfiles/workdir subdirectory of completed tasks older than
    `days_old` days.  `resources`/`appkernels` are comma-separated filters
    (None means all).  Honors self.dry_run (counts but does not delete).
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Removing tasks workdir")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    timenow = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels):
        try:
            # task dirs are named by their activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (timenow - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            workdir_dir = os.path.join(task_dir, "jobfiles", "workdir")
            if not os.path.isdir(workdir_dir):
                continue
            if log.verbose:
                print("Found workdir:", workdir_dir)
            count += 1
            if not self.dry_run:
                shutil.rmtree(workdir_dir)
        except Exception as e:
            # bug fix: bare `except:` also swallowed SystemExit/KeyboardInterrupt
            # and hid the failure cause
            log.error("Cannot process: " + task_dir + "\n" + str(e))
    log.info("Removed %d task workdirs" % count)
def handler(args):
    """Entry point for the archive subcommand; dispatches cron vs manual run."""
    from akrr.util import log
    from akrr.daemon import get_daemon_pid, daemon_start, daemon_stop
    if args.cron is True:
        run_akrr_for_cron()
        return
    log.info("Archiving old completed tasks")
    from akrr.archive import Archive
    # validate the day/month thresholds before touching anything
    if args.pickle_days > args.archive_days:
        log.error("pickle_days should be less or equal to archive_days")
        exit(1)
    if args.archive_months < 1:
        log.error("archive_months should be at least 1")
        exit(1)
    Archive().remove_tasks_state_dumps(days_old=args.pickle_days)
    Archive().archive_tasks(days_old=args.archive_days)
    Archive().archive_tasks_by_months(months_old=args.archive_months)