def _make_dirs(path):
    """Recursively create directories if not in dry run mode"""
    if dry_run:
        # report what would have happened and do nothing
        log.dry_run("_make_dirs({})".format(path))
        return
    log.debug("Creating directory: {}".format(path))
    os.makedirs(path)
def _set_openstack_env(self):
    """Resolve, verify and source the OpenStack environment setup script,
    then cache the resulting $OS_TOKEN in ``self._token``.

    Raises:
        FileNotFoundError: if the setup script can not be located on disk.
    """
    # get location of openstack_env_set_script; resolve only once and
    # cache the result in self._which_openstack_env_set_script
    if self._which_openstack_env_set_script is None:
        if os.path.isfile(self._openstack_env_set_script):
            # the configured value is already a path to an existing file
            self._which_openstack_env_set_script = self._openstack_env_set_script
        else:
            # fall back to a `which` lookup through the shell
            self._which_openstack_env_set_script = self._shell.run_command(
                "which " + self._openstack_env_set_script)
            self._which_openstack_env_set_script = self._which_openstack_env_set_script.strip()
            if self._which_openstack_env_set_script.endswith(
                    self._openstack_env_set_script):
                # `which` returned a plausible path; expand a possible leading ~
                self._which_openstack_env_set_script = os.path.expanduser(
                    self._which_openstack_env_set_script)
    log.debug("which_openstack_env_set_script: " + self._which_openstack_env_set_script)
    # check presence of openstack_env_set_script
    if not os.path.isfile(self._which_openstack_env_set_script):
        msg = "Can not find openstack environment setup script: " + self._openstack_env_set_script
        msg += "\n" + self._which_openstack_env_set_script
        log.error(msg)
        raise FileNotFoundError(msg)
    # set environment in the persistent shell session and grab the auth token
    self.run_cmd("source " + self._which_openstack_env_set_script)
    self._token = self.run_cmd("echo $OS_TOKEN").strip()
def load_app(app_name: str, resources: Dict, app_cfg_filename: str = None, validate=True) -> Dict:
    """
    Load app configuration file and do minimalistic validation.

    :param app_name: name of the application kernel to load
    :param resources: dict of already-loaded resource configurations, keyed by resource name
    :param app_cfg_filename: optional explicit config filename
    :param validate: whether to validate loaded parameters
    :return: dict with app parameters
    :raises AkrrError: if a per-resource app configuration can not be loaded
    """
    log.debug("Loading app %s", app_name)
    # start from the app's default (resource-independent) configuration
    app = load_app_default(app_name)
    # load resource specific parameters
    for resource_name in os.listdir(os.path.join(cfg_dir, "resources")):
        # skip non-resource directories
        if resource_name in ['notactive', 'templates']:
            continue
        app_on_resource_cfg_filename = os.path.join(
            cfg_dir, "resources", resource_name, app_name + ".app.conf")
        # a resource without a per-app config simply does not run this app kernel
        if not os.path.isfile(app_on_resource_cfg_filename):
            continue
        try:
            app['appkernel_on_resource'][resource_name] = load_app_on_resource(
                app_name, resource_name, resources[resource_name], app)
        except Exception:
            # NOTE(review): the message says "Will skip it for now" but the
            # raise below aborts the whole load — confirm which is intended.
            log.error(
                "Exception occurred during app kernel configuration loading for %s from %s." % (
                    app_name, app_on_resource_cfg_filename) + "Will skip it for now.")
            raise AkrrError("Can not load app configuration for %s." % app_name)
    app = verify_app_params(app, app)
    return app
def generate_self_signed_certificate():
    """
    Generate a self signed certificate for the AKRR Rest API.

    Writes server.key, server.cert and the combined server.pem into the
    AKRR etc directory; honors the global dry-run flag.
    """
    log.info("Generating self-signed certificate for REST-API")
    # one shell script: generate key+cert, then concatenate them into a .pem
    cmd = """
    openssl req \
        -new \
        -newkey rsa:4096 \
        -days 3650 \
        -nodes \
        -x509 \
        -subj "/C=US/ST=Denial/L=Springfield/O=Dis/CN=localhost" \
        -keyout {akrr_cfg_dir}/server.key \
        -out {akrr_cfg_dir}/server.cert
    cp {akrr_cfg_dir}/server.key {akrr_cfg_dir}/server.pem
    cat {akrr_cfg_dir}/server.cert >> {akrr_cfg_dir}/server.pem
    """.format(akrr_cfg_dir=os.path.join(_akrr_home, 'etc'))
    if not akrr.dry_run:
        # capture stderr too so openssl diagnostics end up in the debug log
        output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)
        log.debug(output.decode("utf-8"))
        log.info(" new self-signed certificate have been generated")
    else:
        log.dry_run("run command: " + cmd)
def _set_env(self):
    """Resolve, verify and source the OpenStack environment setup script,
    retrying on rate-limit errors, then cache $OS_TOKEN in ``self._token``.

    Raises:
        FileNotFoundError: if the setup script can not be located.
        Exception: if the rate limit (HTTP 429) persists after all retries.
    """
    # get location of openstack_env_set_script; resolve only once
    if self._which_env_set_script is None:
        if os.path.isfile(self._env_set_script):
            self._which_env_set_script = self._env_set_script
        else:
            # fall back to `which` lookup
            self._which_env_set_script = self.run_cmd("which " + self._env_set_script)
            self._which_env_set_script = self._which_env_set_script.strip()
            if self._which_env_set_script.endswith(self._env_set_script):
                self._which_env_set_script = os.path.expanduser(self._which_env_set_script)
    log.debug("which_openstack_env_set_script: " + self._which_env_set_script)
    # check presence of openstack_env_set_script
    if not os.path.isfile(self._which_env_set_script):
        msg = "Can not find openstack environment setup script: " + self._env_set_script
        msg += "\n" + self._which_env_set_script
        log.error(msg)
        raise FileNotFoundError(msg)
    # set environment; retry on rate limiting, but bounded — the original
    # `while True` loop could spin forever if the endpoint keeps returning 429
    max_attempts = 20
    for _attempt in range(max_attempts):
        out = self.run_cmd("source " + self._which_env_set_script)
        if out.count("HTTP Client Error (HTTP 429)") > 0:
            time.sleep(30)
        else:
            break
    else:
        raise Exception("Can not source openstack environment: HTTP 429 rate limit persisted after %d attempts" % max_attempts)
    self._token = self.run_cmd("echo $OS_TOKEN").strip()
def update_bashrc():
    """Add AKRR environment variables to .bashrc.

    Strips any previous AKRR section from ~/.bashrc (keeping lines outside
    the section), appends a fresh section exporting PATH, and writes the
    result back unless in dry-run mode.
    """
    log.info("Updating .bashrc")
    bash_content_new = []
    akrr_header = '#AKRR Server Environment Variables'
    if os.path.exists(os.path.expanduser("~/.bashrc")):
        log.info("Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak")
        if not dry_run:
            # BUGFIX: the backup previously went to ~/.bashrcakrr, which
            # contradicted the log message above; back up to ~/.bashrc_akrrbak
            subprocess.call("cp ~/.bashrc ~/.bashrc_akrrbak", shell=True)
        with open(os.path.expanduser('~/.bashrc'), 'r') as f:
            bashcontent = f.readlines()
        # copy everything except lines inside a previous AKRR section
        in_akrr = False
        for line in bashcontent:
            if line.count(akrr_header + ' [Start]') > 0:
                in_akrr = True
            if not in_akrr:
                bash_content_new.append(line)
            if line.count(akrr_header + ' [End]') > 0:
                in_akrr = False
    # append a fresh AKRR section
    bash_content_new.append("\n" + akrr_header + " [Start]\n")
    bash_content_new.append("export PATH=\"{0}/bin:$PATH\"\n".format(akrr_home))
    bash_content_new.append(akrr_header + " [End]\n\n")
    if not dry_run:
        with open(os.path.expanduser('~/.bashrc'), 'w') as f:
            for line in bash_content_new:
                f.write(line)
        log.info("Appended AKRR records to $HOME/.bashrc")
    else:
        log.debug("New .bashrc should be like" + "\n".join(bash_content_new))
def update_bashrc(self):
    """Add AKRR environment variables to .bashrc.

    Builds the AKRR section (PATH export for source installs, AKRR_HOME
    export for non-standard home locations), then splices it into
    ~/.bashrc, replacing any previous AKRR section in place. If the
    section would carry no exports, .bashrc is left untouched.
    """
    log.info("Updating .bashrc")
    akrr_header = '#AKRR Server Environment Variables'
    # build the new AKRR section first
    akrr_bash_content_new = list()
    akrr_bash_content_new.append("\n" + akrr_header + " [Start]\n")
    if _in_src_install:
        akrr_bash_content_new.append(
            "export PATH=\"{0}:$PATH\"\n".format(_akrr_bin_dir))
    if akrr.get_akrr_dirs(
            self.akrr_home_dir
    )['akrr_home_type'] == akrr.AKRRHomeType.in_env_path:
        # i.e. non standard AKRR home location
        akrr_bash_content_new.append(
            "export AKRR_HOME=\"{0}\"\n".format(_akrr_home))
    akrr_bash_content_new.append(akrr_header + " [End]\n\n")
    # more than just the [Start]/[End] markers => something to export
    if len(akrr_bash_content_new) > 2:
        if os.path.exists(os.path.expanduser("~/.bashrc")):
            log.info(
                "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc.akrr_back")
            if not akrr.dry_run:
                subprocess.call("cp ~/.bashrc ~/.bashrc.akrr_back", shell=True)
            bash_content_new = []
            with open(os.path.expanduser('~/.bashrc'), 'r') as f:
                bashcontent = f.readlines()
            # replace a previous AKRR section in place (at its old position),
            # keeping everything outside the section
            in_akrr = False
            akrr_added = False
            for line in bashcontent:
                if line.count(akrr_header + ' [Start]') > 0:
                    in_akrr = True
                    if not akrr_added:
                        bash_content_new += akrr_bash_content_new
                        akrr_added = True
                if not in_akrr:
                    bash_content_new.append(line)
                if line.count(akrr_header + ' [End]') > 0:
                    in_akrr = False
            # no previous section found: append the new one at the end
            if not akrr_added:
                bash_content_new += akrr_bash_content_new
        else:
            # no .bashrc yet: the new file is just the AKRR section
            bash_content_new = akrr_bash_content_new
        if not akrr.dry_run:
            with open(os.path.expanduser('~/.bashrc'), 'w') as f:
                for line in bash_content_new:
                    f.write(line)
            log.info("Appended AKRR records to $HOME/.bashrc")
        else:
            log.debug("New .bashrc should be like:\n" + "".join(bash_content_new))
    else:
        log.info(
            "AKRR is in standard location, no updates to $HOME/.bashrc")
def run_cloud_cmd(self, cmd):
    """Run an ``openstack`` CLI command; on an expired token, refresh the
    environment once and retry. Raises if the token is still rejected."""
    full_cmd = "openstack " + cmd
    out = self.run_cmd(full_cmd)
    if "Failed to validate token" in out:
        # token likely expired: re-source the environment and retry once
        self._set_env()
        out = self.run_cmd(full_cmd)
    log.debug(cmd + "\n" + out)
    if "Failed to validate token" in out:
        raise Exception("Can not execute openstack command!\n" + out)
    return out
def check_rw_db(connection_func, pre_msg, post_msg):
    """
    Check that the user has the correct privileges to the database at the end
    of the connection provided by 'connection_func'. Specifically, checking
    for read / write permissions ( and create table ).

    :type connection_func function
    :type pre_msg str
    :type post_msg str

    :param connection_func: the function that will provide a (connection, cursor) tuple.
    :param pre_msg: a message to be provided to the user before the checks begin.
    :param post_msg: a message to be provided to the user after the checks are successful
    :return: true if the database is available / the provided user has the correct privileges.
    """
    success = False
    log.debug(pre_msg)
    try:
        connection, cursor = connection_func()
        # probe write/create privilege by creating a throwaway table
        try:
            with connection:
                result = cursor.execute(
                    "CREATE TABLE CREATE_ME(`id` INT NOT NULL PRIMARY KEY, `name` VARCHAR(48));")
                success = True if result == 0 else False
                if success:
                    log.debug(post_msg, success)
                else:
                    log.error(post_msg, success)
        except MySQLdb.Error as e:
            log.error("Error during: " + pre_msg)
            log.error(
                'Unable to create a table w/ the provided username. %s: %s',
                e.args[0], e.args[1])
        # NOTE(review): a fresh connection is opened to drop the probe table
        # even when the CREATE above failed — in that case the DROP fails too
        # and only logs; confirm this best-effort cleanup is intended.
        connection, cursor = connection_func()
        try:
            with connection:
                cursor.execute("DROP TABLE CREATE_ME;")
        except MySQLdb.Error as e:
            log.error("Error during: " + pre_msg)
            log.error(
                'Unable to drop the table created to check permissions. %s: %s',
                e.args[0], e.args[1])
    except MySQLdb.Error as e:
        log.error("Error during: " + pre_msg)
        log.error('Unable to connect to Database. %s: %s', e.args[0], e.args[1])
    return success
def append_to_bashrc(resource):
    """Append AKRR environment variables to the remote resource's .bashrc.

    Connects over ssh, strips any previous AKRR section (the embedded shell
    script backs the old file up to ~/.bashrc_akrrbak first), then appends a
    fresh section exporting the AKRR scratch/app-kernel directories.
    No-op in dry-run mode.
    """
    # append environment variables to .bashrc
    log.info("\nAdding AKRR enviroment variables to resource's .bashrc!\n")
    if akrr.dry_run:
        return
    # capture everything the ssh session prints so it can be shown on failure
    str_io = io.StringIO()
    try:
        sys.stdout = sys.stderr = str_io
        rsh = akrr.util.ssh.ssh_resource(resource)
        akrr_header = 'AKRR Remote Resource Environment Variables'
        # remove a previous AKRR section from the remote .bashrc, if present:
        # head takes everything before [Start], tail everything after [End]
        out = akrr.util.ssh.ssh_command(
            rsh, '''if [ -e $HOME/.bashrc ]
then
    if [[ `grep "\#''' + akrr_header + ''' \[Start\]" $HOME/.bashrc` == *"''' + akrr_header + ''' [Start]"* ]]
    then
        echo "Updating AKRR record in $HOME/.bashrc, backing to $HOME/.bashrc_akrrbak"
        cp $HOME/.bashrc $HOME/.bashrc_akrrbak
        head -n "$(( $(grep -n '\#''' + akrr_header + ''' \[Start\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) - 1 ))" $HOME/.bashrc_akrrbak > $HOME/.bashrc
        tail -n "+$(( $(grep -n '\#''' + akrr_header + ''' \[End\]' $HOME/.bashrc_akrrbak | head -n 1 | cut -d ":" -f 1) + 1 ))" $HOME/.bashrc_akrrbak >> $HOME/.bashrc
    fi
fi''')
        log.debug(out)
        # now append the fresh AKRR section line by line
        cmds = ('''echo "Appending AKRR records to $HOME/.bashrc"''',
                '''echo "#''' + akrr_header + ''' [Start]" >> $HOME/.bashrc''',
                '''echo "export AKRR_NETWORK_SCRATCH=\\"''' + resource['network_scratch'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_LOCAL_SCRATCH=\\"''' + resource['local_scratch'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_APPKER_DIR=\\"''' + resource['appkernel_dir'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "export AKRR_AKRR_DIR=\\"''' + resource['akrr_data'] + '''\\"" >> $HOME/.bashrc''',
                '''echo "#''' + akrr_header + ''' [End]" >> $HOME/.bashrc''',
                '''echo "Appending AKRR records to $HOME/.bashrc"''')
        for cmd in cmds:
            out = akrr.util.ssh.ssh_command(rsh, cmd)
            log.debug(out)
        rsh.close(force=True)
        del rsh
        # restore the real streams on success
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
    except Exception as e:
        # restore the real streams before reporting the captured output
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
        log.critical(
            "Can not connect to %s\nProbably invalid credential, see full error report:\n%s",
            resource['name'], str_io.getvalue())
        raise e
def set_default_value_for_unset_vars():
    """Post process settings: locate the akrr executable, classify the
    install type (rpm / dev / in-source) and fill in any unset AKRR
    directory locations (home, conf, log)."""
    import os
    from .util import run_cmd_getoutput
    from akrr.util import log
    global which_akrr
    global akrr_conf
    global akrr_conf_dir
    global akrr_home_dir
    global default_akrr_home_dir
    global akrr_log_dir
    global in_source_install
    global rpm_install
    global dev_install
    if which_akrr is None or which_akrr == "akrr":
        try:
            which_akrr = run_cmd_getoutput("which akrr").strip()
        except Exception as e:
            log.critical("Can not find akrr executable")
            raise e
    # BUGFIX: the install-type cases are mutually exclusive, so use a single
    # if/elif/else chain. Previously two separate `if` statements meant a
    # /usr/bin (rpm) install also fell into the second `if`'s else branch and
    # set in_source_install=True, which made the default_akrr_home_dir logic
    # below take the in-source branch for rpm installs.
    if os.path.dirname(which_akrr) == "/usr/bin":
        rpm_install = True
    elif os.path.dirname(which_akrr) == "/usr/local/bin":
        dev_install = True
    else:
        in_source_install = True
    # set default_akrr_home_dir
    if in_source_install:
        # in-source: home is the checkout root (two levels above bin/akrr)
        default_akrr_home_dir = os.path.abspath(
            os.path.dirname(os.path.dirname(which_akrr)))
    elif rpm_install or dev_install:
        default_akrr_home_dir = os.path.expanduser("~/akrr")
    if akrr_home_dir is None:
        akrr_home_dir = default_akrr_home_dir
    else:
        akrr_home_dir = os.path.expanduser(akrr_home_dir)
    akrr_conf_dir = os.path.join(akrr_home_dir, "etc")
    akrr_conf = os.path.join(akrr_home_dir, "etc", 'akrr.conf')
    akrr_log_dir = os.path.join(akrr_home_dir, "log")
    log.debug("AKRR conf dir and log dir locations:\n"
              "    akrr_home: {}\n"
              "    akrr_conf: {}\n"
              "    akrr_conf_dir: {}\n"
              "    akrr_log_dir: {}\n"
              "".format(akrr_home_dir, akrr_conf, akrr_conf_dir, akrr_log_dir))
def _detect_network(self):
    """Query OpenStack for the started server's addresses and set
    ``internal_network_ip``, ``flexible_ip`` (if any) and ``ip``.

    Raises:
        Exception: if the server is not listed (did not start).
    """
    out = self.openstack.run_open_stack_cmd("server list -f json --name " + self.name)
    out = json.loads(out.strip())
    if len(out) == 0:
        raise Exception("Openstack server didn't start!")
    out = out[0]
    # "Networks" looks like "netname=addr1, addr2, ..."
    s = out["Networks"]
    all_ips = s[s.find('=') + 1:].replace(',', ' ').split()
    # BUGFIX/consistency: keep only IPv4 addresses — the list can contain
    # IPv6 entries too; the sibling cloud implementation already filters them.
    all_ips = [ip for ip in all_ips if ip.count(".") > 0]
    # first address is the internal one
    self.internal_network_ip = all_ips[0]
    log.debug("internal_network_ip: " + self.internal_network_ip)
    if len(all_ips) > 1:
        # a second address means a floating/flexible ip was attached
        self.flexible_ip = all_ips[-1]
        log.debug("flexible_ip: " + self.flexible_ip)
    # prefer the externally reachable address when present
    self.ip = all_ips[-1]
def _detect_network(self):
    """Detect the started server's internal and external IPv4 addresses and
    store them in ``internal_network_ip``, ``flexible_ip`` and ``ip``."""
    raw = self.cloud_cli.run_cloud_cmd("server list -f json --name " + self.name)
    servers = json.loads(raw.strip())
    if not servers:
        raise Exception("Openstack server didn't start!")
    # "Networks" looks like "netname=addr1, addr2, ..."
    networks = servers[0]["Networks"]
    addresses = networks[networks.find('=') + 1:].replace(',', ' ').split()
    # Get only ip4
    ip4_list = [addr for addr in addresses if addr.count(".") > 0]
    self.internal_network_ip = ip4_list[0]
    log.debug("internal_network_ip: " + self.internal_network_ip)
    if len(ip4_list) > 1:
        # a second address means a floating/flexible ip was attached
        self.flexible_ip = ip4_list[-1]
        log.debug("flexible_ip: " + self.flexible_ip)
    self.ip = ip4_list[-1]
def retrieve_resources_from_xdmod():
    """
    Retrieve the applicable contents of the `modw`.`resourcefact` table.

    :return: a tuple of strings containing the name of the resources.
    """
    con, cur = akrr.db.get_xd_db()
    # AKRR can run without XDMoD's modw database; nothing to report then
    if con is None:
        return tuple()
    cur.execute("SELECT `name`,`id` FROM `modw`.`resourcefact`")
    resource_rows = cur.fetchall()
    record_count = len(resource_rows) if resource_rows else 0
    log.debug("Retrieved %s Resource records...", record_count)
    return resource_rows
def remove_tasks_state_dumps(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    Remove task state dump files older than ``days_old`` days.

    :param days_old: only tasks older than this many days are processed
    :param resources: comma separated string or list of resource names to filter on
    :param appkernels: comma separated string or list of app kernel names to filter on
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Removing tasks state dumps")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    timenow = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels):
        try:
            # task directory name is its activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (timenow - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            proc_dir = os.path.join(task_dir, "proc")
            if not os.path.isdir(proc_dir):
                continue
            for state_file in os.listdir(proc_dir):
                if _state_dump.match(state_file) is None:
                    continue
                log.debug2(" delete:", state_file)
                state_file_fullpath = os.path.join(proc_dir, state_file)
                count += 1
                if not self.dry_run:
                    os.remove(state_file_fullpath)
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit
            log.error("Cannot process: " + task_dir)
    log.info("Removed %d task state dumps" % count)
def check_r_db(connection_func, pre_msg, post_msg):
    """
    Check that the user has the correct privileges to the database at the end
    of the connection provided by 'connection_func'. Specifically checking
    for read permissions.

    :type connection_func function
    :type pre_msg str
    :type post_msg str

    :param connection_func: the function that will provide a (connection, cursor) tuple.
    :param pre_msg: a message to be provided to the user before the checks begin.
    :param post_msg: a message to be provided to the user after the checks are successful
    :return: true if the database is available / the provided user has the correct privileges.
    """
    log.debug(pre_msg)
    success = False
    try:
        connection, cursor = connection_func()
        # probe read privilege with a harmless count query
        try:
            with connection:
                row_count = cursor.execute(
                    "SELECT COUNT(*) FROM `modw`.`resourcefact`;")
                success = row_count >= 0
                if success:
                    log.debug(post_msg, success)
                else:
                    log.error("Error during: " + pre_msg)
                    log.error(post_msg, success)
        except MySQLdb.Error as e:
            log.error("Error during: " + pre_msg)
            log.error('Unable to select from `modw`.`resourcefact`. %s: %s',
                      e.args[0], e.args[1])
    except MySQLdb.Error as e:
        log.error("Error during: " + pre_msg)
        log.error('Unable to connect to Database. %s: %s', e.args[0], e.args[1])
    return success
def archive_tasks(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    Archive (tar.gz and delete) task directories older than ``days_old`` days.

    :param days_old: only tasks older than this many days are archived
    :param resources: comma separated string or list of resource names to filter on
    :param appkernels: comma separated string or list of app kernel names to filter on
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Archiving tasks")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    time_now = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    task_dirs = self.get_tasks_dir_list(resources, appkernels)
    n_task_dirs = len(task_dirs)
    # update the progress bar roughly 50 times over the whole run
    progress_update = max(int(round(n_task_dirs / 50)), 1)
    for i in range(n_task_dirs):
        task_dir = task_dirs[i]
        if i % progress_update == 0:
            progress_bar(i / n_task_dirs)
        try:
            # task directory name is its activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (time_now - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            # BUGFIX: use a context manager so the tar stream is closed even
            # if out.add() fails part way through
            with tarfile.open(task_dir + '.tar.gz', mode='w|gz') as out:
                out.add(task_dir, time_stamp)
            shutil.rmtree(task_dir)
            count += 1
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit
            log.error("Cannot process: " + task_dir)
    progress_bar()
    log.info("Archived %d tasks" % count)
def remove_tasks_workdir(self, days_old: int, resources=None, appkernels=None) -> None:
    """
    Remove tasks' jobfiles/workdir directories older than ``days_old`` days.

    :param days_old: only tasks older than this many days are processed
    :param resources: comma separated string or list of resource names to filter on
    :param appkernels: comma separated string or list of app kernel names to filter on
    """
    # (docstring previously said "remove tasks state dumps" — this method
    # removes work directories)
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Removing tasks workdir")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("days: " + str(days_old))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    timenow = datetime.datetime.now()
    seconds_in_day = 24 * 3600
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels):
        try:
            # task directory name is its activation time stamp
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            days_passed = (timenow - activate_time).total_seconds() / seconds_in_day
            if days_passed < days_old:
                continue
            workdir_dir = os.path.join(task_dir, "jobfiles", "workdir")
            if not os.path.isdir(workdir_dir):
                continue
            if log.verbose:
                print("Found workdir:", workdir_dir)
            count += 1
            if not self.dry_run:
                shutil.rmtree(workdir_dir)
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit
            log.error("Cannot process: " + task_dir)
    log.info("Removed %d task workdirs" % count)
def _get_app_execution_method(app_on_resource_cfg_filename: str, default: str = None) -> Optional[str]: """ read app kernel execution method from app_on_resource_cfg_filename if app_on_resource_cfg_filename do not exists return default """ if not os.path.isfile(app_on_resource_cfg_filename): log.debug("While checking execution_method:" "Application kernel configuration file do not exists (%s)!" "It might be ok.", app_on_resource_cfg_filename) return default with open(app_on_resource_cfg_filename, "rt") as fin: line = fin.readline() while line: m = re.match(r"execution_method\s*=\s*", line) if m: var = {} exec(line, var) return var['execution_method'] line = fin.readline() return default
def expectSendline(self, pattern, cmd, timeoutMessage="EOF or TIMEOUT", replaceCMD=True, addAfter=True, **kwargs):
    """Custom expect helper: wait for `pattern` from the previous command,
    then send `cmd`.

    EOF and TIMEOUT patterns are added automatically; matching either of
    them raises ExpectTimeout with `timeoutMessage` (or the session's own
    ``self.timeoutMessage`` when the default message is used). The captured
    output is logged at debug level (shown when verbosity>=3).

    If `pattern` is a list, returns the index of the matched pattern within
    that list; otherwise returns the command output cleared from special
    symbols.
    """
    # always watch for EOF/TIMEOUT in addition to the caller's pattern(s)
    p = [pexpect.EOF, pexpect.TIMEOUT]
    if type(pattern) is list:
        p += pattern
    else:
        p.append(pattern)
    imatch = self.expect(p, **kwargs)
    # capture the raw output of the previous command
    output = self.getCmdOutput(clearSpecialSymbols=False, addAfter=addAfter, replaceCMD=False)
    if hasattr(self, 'output'):
        self.output += output
    log.debug(output)
    # indexes 0 and 1 are the EOF/TIMEOUT sentinels added above
    if imatch == 0 or imatch == 1:
        msg = timeoutMessage
        if hasattr(self, 'timeoutMessage') and timeoutMessage == "EOF or TIMEOUT":
            # fall back to the session-level message only when the caller
            # did not supply a custom one
            msg = self.timeoutMessage
        raise ExpectTimeout(msg)
    # remember and send the next command
    self.lastcmd = cmd + "\n"
    self.sendline(cmd)
    if type(pattern) is list:
        # shift by 2 to index into the caller's original pattern list
        return imatch - 2
    else:
        return self.getCmdOutput(clearSpecialSymbols=True, addAfter=addAfter, replaceCMD=replaceCMD)
def run_test_job(resource, app_name="test", nodes=2):
    """Submit a test job via the AKRR REST API, wait for completion and
    analyze the output; reuses an already-submitted test job if one exists."""
    log.info(
        "Will send test job to queue, wait till it executed and will analyze the output"
    )
    log.debug("Will use AKRR REST API at {}".format(akrrrestclient.restapi_host))
    check_connection_to_rest_api()
    if akrr.dry_run:
        return
    # reuse a previously submitted test job if there is one
    existing_task_id = check_if_test_job_already_submitted(resource, app_name)
    if existing_task_id is not None:
        task_id = existing_task_id
    else:
        task_id = submit_test_job(resource, app_name, nodes)
    monitor_test_job(task_id)
    analyse_test_job_results(task_id, resource, app_name)
    # job finished: release the per-resource/app lock file
    os.remove(get_test_job_lock_filename(resource, app_name))
def _check_user_db_priv_on_dbserver(user: str, password: str, host: str, port: int, db_name: str, priv: str) \
        -> Tuple[bool, bool, bool]:
    """
    Check if user and database already exists and privileges are ok

    Returns: user_exists, db_exists, user_rights_are_correct
    """
    try:
        # connect with provided user, Exception will raise if user can not connect
        _, cur = get_con_to_db(user, password, host, port)
        client_host = get_db_client_host(cur)
        db_exists = db_exist(cur, db_name)
        if not db_exists:
            log.debug("Database %s doesn't exists on %s", db_name, host)
        user_rights_are_correct = db_check_priv(cur, db_name, priv, user, client_host)
        if not user_rights_are_correct:
            log.debug(
                "User %s doesn't have right privilege on %s, should be %s",
                user, db_name, priv)
        # connection succeeded, so the user exists
        return True, db_exists, user_rights_are_correct
    except MySQLdb.Error:
        # failed to connect: treat everything as missing
        log.debug("User (%s) does not exists on %s", user, host)
        return False, False, False
def remove(
        db_akrr=False, db_appkernel=False, db_modw=False, db_user=False,
        conf_dir=False, log_dir=False, bashrc=False,
        crontab=False, crontab_remove_mailto=False, **kwargs):
    """Remove selected AKRR components: databases, db user, config/log
    directories, .bashrc records and crontab entries."""
    log.debug(
        "Removal options for removal:\n"
        " db_akrr: {}\n"
        " db_appkernel: {}\n"
        " db_modw: {}\n"
        " db_user: {}\n"
        " conf_dir: {}\n"
        " log_dir: {}\n"
        " bashrc: {}\n"
        " crontab: {} , crontab_remove_mailto: {}\n"
        "".format(
            db_akrr, db_appkernel, db_modw, db_user,
            conf_dir, log_dir, bashrc, crontab, crontab_remove_mailto)
    )
    _stop_akrr()
    log.debug2("Unused keyword arguments: {}".format(kwargs))
    # run the removal steps in the original order, each gated by its flag
    removal_steps = (
        (db_user, _remove_user),
        (db_akrr, _remove_akrr_db),
        (db_appkernel, _remove_ak_db),
        (db_modw, _remove_modw_db),
        (conf_dir, _remove_conf_dir),
        (log_dir, _remove_log_dir),
        (bashrc, _remove_from_bashrc))
    for enabled, step in removal_steps:
        if enabled:
            step()
    if crontab:
        _remove_from_crontab(crontab_remove_mailto)
def remove_task_from_remote_queue(self):
    """Remove this task's job from the remote resource's batch queue.

    On success schedules ``task_is_complete`` as the next step; on failure
    records the error, bumps the fatal error count and asks to be retried.

    :return: None on success, otherwise the default retry interval.
    """
    try:
        from string import Template
        # pick the scheduler-specific kill command template and fill in the job id
        m_kill_expression = kill_expressions[self.resource['batch_scheduler']]
        cmd = Template(m_kill_expression[0]).substitute(jobId=str(self.RemoteJobID))
        msg = ssh.ssh_resource(self.resource, cmd)
        log.debug(msg)
        self.set_method_to_run_next(
            "task_is_complete", "Task is probably removed from remote queue.",
            copy.deepcopy(msg))
        return None
    except Exception as e:
        # BUGFIX: the message previously misreported the function as
        # "process_results_old"; also dropped the dead `sh` cleanup — `sh`
        # was assigned None and never reassigned, so that branch never ran.
        log.exception("Got exception in remove_task_from_remote_queue: %s\n%s\n",
                      e, traceback.format_exc())
        self.set_method_to_run_next(
            None, "ERROR Can not remove job from queue on remote resource",
            traceback.format_exc())
        self.fatal_errors_count += 1
        return active_task_default_attempt_repeat
def _detect_network(self):
    """Poll gcloud until the instance has an external NAT IP, then record
    internal and external addresses.

    NOTE(review): the polling loop has no upper bound — if the instance
    never gets a natIP this spins forever (1s sleep per attempt); consider
    adding a timeout. Also, when ``self.network_tier`` is None no addresses
    are detected at all — confirm callers handle that.
    """
    if self.network_tier is not None:
        while True:
            out = self.cloud_cli.run_cloud_cmd(
                f"compute --project={self.project} instances describe --zone={self.zone} {self.name} --format=json")
            try:
                out = json.loads(out.strip())
            except json.JSONDecodeError:
                # non-JSON output means the instance is not up
                raise Exception("server didn't start!")
            # keep polling until an external NAT IP shows up in accessConfigs
            if "networkInterfaces" in out and len(out["networkInterfaces"]) > 0 and \
                    "accessConfigs" in out["networkInterfaces"][0] and \
                    len(out["networkInterfaces"][0]["accessConfigs"]) > 0 and \
                    "natIP" in out["networkInterfaces"][0]["accessConfigs"][0] and \
                    out["networkInterfaces"][0]["accessConfigs"][0]["natIP"] != "":
                break
            else:
                time.sleep(1)
        self.internal_network_ip = out["networkInterfaces"][0]["networkIP"]
        self.ip = out["networkInterfaces"][0]["accessConfigs"][0]["natIP"]
        self.flexible_ip = self.ip
        log.debug("flexible_ip: " + self.flexible_ip)
def _set_env(self):
    """Locate, verify and source the google cloud environment setup script.

    All steps are skipped gracefully when no setup script is configured
    (``self._env_set_script`` is None).

    Raises:
        FileNotFoundError: if a script is configured but can not be found.
    """
    # get location of _env_set_script; resolve only once and only when configured
    if self._which_env_set_script is None and self._env_set_script is not None:
        if os.path.isfile(self._env_set_script):
            # the configured value is already a path to an existing file
            self._which_env_set_script = self._env_set_script
        else:
            # fall back to `which` lookup through the shell
            self._which_env_set_script = self._shell.run_command("which " + self._env_set_script)
            self._which_env_set_script = self._which_env_set_script.strip()
            if self._which_env_set_script.endswith(self._env_set_script):
                # `which` returned a plausible path; expand a possible leading ~
                self._which_env_set_script = os.path.expanduser(self._which_env_set_script)
        log.debug("_which_env_set_script: " + self._which_env_set_script)
    # check presence of _which_env_set_script
    if self._which_env_set_script is not None and not os.path.isfile(self._which_env_set_script):
        msg = "Can not find google cloud environment setup script: " + self._env_set_script
        msg += "\n" + self._which_env_set_script
        log.error(msg)
        raise FileNotFoundError(msg)
    # set environment (no-op when no script is configured)
    if self._which_env_set_script is not None:
        self.run_cmd("source " + self._which_env_set_script)
def _remove_from_crontab(remove_mailto=False):
    """Remove AKRR entries from the user's crontab.

    :param remove_mailto: also remove MAILTO lines from the crontab.
    """
    try:
        crontab_content = subprocess.check_output("crontab -l", shell=True)
    except subprocess.CalledProcessError:
        log.error("Can not run crontab -l")
        return
    new_crontab = False
    crontab_content = crontab_content.decode("utf-8").splitlines(True)
    # write the filtered crontab to a temporary file
    with open(os.path.expanduser('.crontmp'), 'w') as f:
        for line in crontab_content:
            not_akrr = True
            # AKRR entries invoke checknrestart.sh / restart.sh
            if line.count('akrr') > 0 and (line.count('checknrestart.sh') > 0 or line.count('restart.sh') > 0):
                not_akrr = False
            if remove_mailto and line.count('MAILTO') > 0:
                not_akrr = False
            if not_akrr:
                f.write(line)
            else:
                new_crontab = True
    if new_crontab:
        log.info("AKRR Section present in crontab. Cleaning crontab.")
        try:
            if not dry_run:
                output = subprocess.check_output("crontab .crontmp", shell=True).decode("utf-8")
                log.debug(output)
            else:
                log.info("DRY RUN: should run `crontab .crontmp`. .crontmp:" + open(".crontmp", "rt").read())
        except subprocess.CalledProcessError:
            log.error("Can not run crontab .crontmp")
    else:
        log.info("There was no AKRR records detected in crontab list")
    # BUGFIX: always remove the temporary file — it previously leaked when
    # no AKRR records were found
    os.remove(".crontmp")
def cursor_execute(cur, query, args=None, dry_run=None):
    """Execute database affecting command if not in dry run mode"""
    if dry_run is None:
        # fall back to the global AKRR dry-run flag
        import akrr
        dry_run = akrr.dry_run
    # classify the statement by its first keyword
    tokens = query.split()
    first_word = tokens[0].lower() if len(tokens) > 0 else "UNKNOWN"
    is_read_only = first_word in ('select', 'show')
    if log.verbose or dry_run:
        # render the query with its arguments for logging purposes only
        if args is not None:
            if isinstance(args, dict):
                args_filled = {key: cur.connection.literal(item) for key, item in args.items()}
            else:
                args_filled = tuple(cur.connection.literal(item) for item in args)
            query_filled = query % args_filled
        else:
            query_filled = query
        if dry_run:
            log.dry_run("SQL: " + query_filled)
        else:
            log.debug("SQL: " + query_filled)
    # read-only statements always run; modifying ones only outside dry-run
    if is_read_only or not dry_run:
        cur.execute(query, args)
def update_layout(self, resources=None, appkernels=None) -> None:
    """
    Update resource/appkernel/task to resource/appkernel/year/month/task layout.

    :param resources: comma separated string or list of resource names to filter on
    :param appkernels: comma separated string or list of app kernel names to filter on
    """
    resources = get_list_from_comma_sep_values(resources)
    appkernels = get_list_from_comma_sep_values(appkernels)
    log.info("Updating layout")
    log.debug("resources filter: " + str(resources))
    log.debug("appkernels filter: " + str(appkernels))
    log.debug("dry_run: " + str(self.dry_run))
    log.debug("comp_task_dir: " + str(self.comp_task_dir))
    count = 0
    for task_dir in self.get_tasks_dir_list(resources, appkernels, old_layout_only=True):
        try:
            # task directory name is its activation time stamp; derive year/month
            time_stamp = os.path.basename(task_dir)
            activate_time = time_stamp_to_datetime(time_stamp)
            year = str(activate_time.year)
            month = "%02d" % activate_time.month
            year_month_dir = os.path.join(os.path.dirname(task_dir), year, month)
            if log.verbose:
                print("Move:", task_dir)
                print("  to:", year_month_dir)
            count += 1
            if not self.dry_run:
                os.makedirs(year_month_dir, exist_ok=True)
                shutil.move(task_dir, year_month_dir)
        except Exception:
            # BUGFIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit
            log.error("Cannot process: " + task_dir)
            import traceback
            traceback.print_exc()
    log.info("Moved %d task dirs" % count)
def copy_exec_sources_and_inputs(rsh, resource):
    """Copy exec sources and inputs to remote resource.

    For each of the inputs and execs tarballs: if the corresponding directory
    is not already present in the resource's appkernel_dir, scp the tarball
    over, unpack it and verify the unpacked directory exists. Existing
    directories are assumed correct and only produce a warning.
    Honors the global dry-run flag for all transferring/unpacking steps.
    """
    log.info(
        "Preparing to copy application signature calculator,\n"
        " app. kernel input files and \n"
        " HPCC, IMB, IOR and Graph500 source code to remote resource\n")
    try:
        akrr.util.ssh.ssh_command(rsh, "cd %s" % resource['appkernel_dir'])
        out = akrr.util.ssh.ssh_command(rsh, "ls " + resource['appkernel_dir'])
        files_in_appker_dir = out.strip().split()
        # ---- inputs tarball ----
        if not ("inputs" in files_in_appker_dir or "inputs/" in files_in_appker_dir):
            log.info("Copying app. kernel input tarball to %s", resource['appkernel_dir'])
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/inputs.tar.gz",
                    resource['appkernel_dir'])
            log.info("Unpacking app. kernel input files to %s/inputs", resource['appkernel_dir'])
            if not akrr.dry_run:
                out = akrr.util.ssh.ssh_command(
                    rsh, "tar xvfz %s/inputs.tar.gz" % resource['appkernel_dir'])
                log.debug(out)
                # verify the unpacked directory is actually there
                out = akrr.util.ssh.ssh_command(
                    rsh, "du -h %s/inputs" % resource['appkernel_dir'])
                log.debug(out)
                if out.count("No such file or directory") == 0:
                    log.info("App. kernel input files are in %s/inputs\n",
                             resource['appkernel_dir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel inputs directory %s/inputs is present, assume they are correct.\n",
                log.warning_count, resource['appkernel_dir'])
        # ---- execs tarball ----
        if not ("execs" in files_in_appker_dir or "execs/" in files_in_appker_dir):
            log.info(
                "Copying app. kernel execs tarball to %s\n" % (resource['appkernel_dir']) +
                "It contains HPCC,IMB,IOR and Graph500 source code and app.signature calculator")
            if not akrr.dry_run:
                akrr.util.ssh.scp_to_resource(
                    resource, cfg.appker_repo_dir + "/execs.tar.gz",
                    resource['appkernel_dir'])
            log.info(
                "Unpacking HPCC,IMB,IOR and Graph500 source code and app.signature calculator files to %s/execs",
                resource['appkernel_dir'])
            if not akrr.dry_run:
                out = akrr.util.ssh.ssh_command(
                    rsh, "tar xvfz %s/execs.tar.gz" % resource['appkernel_dir'])
                log.debug(out)
                # NOTE(review): this branch checks with `df -h` while the
                # inputs branch uses `du -h` — presumably both meant `du`;
                # confirm before changing.
                out = akrr.util.ssh.ssh_command(
                    rsh, "df -h %s/execs" % resource['appkernel_dir'])
                log.debug(out)
                if out.count("No such file or directory") == 0:
                    log.info(
                        "HPCC,IMB,IOR and Graph500 source code and app.signature calculator are in %s/execs\n",
                        resource['appkernel_dir'])
                else:
                    raise Exception("files are not copied!")
        else:
            log.warning_count += 1
            log.warning(
                "WARNING %d: App. kernel executables directory %s/execs is present, assume they are correct.",
                log.warning_count, resource['appkernel_dir'])
            log.warning(
                "It should contain HPCC,IMB,IOR and Graph500 source code and app.signature calculator\n")
        # clean up the transferred tarballs (relative to the cwd set above)
        akrr.util.ssh.ssh_command(rsh, "rm execs.tar.gz inputs.tar.gz")
    except Exception as e:
        log.critical("Can not copy files to %s", resource['name'])
        raise e