def test_scp(testconfig, tmpdir):
    """
    Copy a file to and from the resource
    """
    import os
    import akrr.util.ssh as ssh

    resource = prep_resource_dict(testconfig)

    content = u"this is test.\ntest is this!\n"

    sh = ssh.ssh_resource(resource)
    pwd = ssh.ssh_command(sh, 'pwd').strip()

    # copy a local file to the resource, duplicate it there remotely
    p = tmpdir / "testfile1.txt"
    p.write_text(content, encoding='utf8')
    ssh.scp_to_resource(resource, str(p), pwd)
    ssh.ssh_command(sh, "cp testfile1.txt testfile2.txt")

    # copy the duplicate back and check that the content survived the round trip
    p = tmpdir / "testfile2.txt"
    ssh.scp_from_resource(resource, os.path.join(pwd, "testfile2.txt"), str(p))
    assert p.read_text(encoding='utf8').strip() == content.strip()
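# The tests in this module assume a pytest ``testconfig`` fixture that supplies
# the ssh settings. A minimal conftest.py-style sketch with placeholder values
# (an assumption; the real fixture may load these from a config file):
import pytest


@pytest.fixture(scope="session")
def testconfig():
    return {
        'ssh': {
            'host': 'localhost',                  # placeholder host
            'user': 'akrruser',                   # placeholder user
            'password': None,                     # or a real password
            'private_key_name': '~/.ssh/id_rsa',  # placeholder key path
            'private_key_passphrase': None,
        }
    }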
def test_ssh_access(testconfig):
    """
    tests ssh.ssh_access and ssh_access_multytry
    """
    import akrr.util.ssh as ssh

    sshcfg = testconfig['ssh']

    # run single command
    assert ssh.ssh_access(
        sshcfg['host'], ssh='ssh', username=sshcfg['user'], password=sshcfg['password'],
        private_key_file=sshcfg['private_key_name'],
        private_key_password=sshcfg['private_key_passphrase'],
        command='whoami').strip() == sshcfg['user']

    # run shell session
    sh = ssh.ssh_access_multytry(
        sshcfg['host'], ssh='ssh', username=sshcfg['user'], password=sshcfg['password'],
        private_key_file=sshcfg['private_key_name'],
        private_key_password=sshcfg['private_key_passphrase'])
    assert ssh.ssh_command(sh, 'whoami').strip() == sshcfg['user']
    del sh
def test_check_dir(testconfig):
    import os
    import akrr.util.ssh as ssh

    sshcfg = testconfig['ssh']
    resource = prep_resource_dict(testconfig)

    sh = ssh.ssh_resource(resource)
    assert ssh.ssh_command(sh, 'whoami').strip() == sshcfg['user']

    pwd = ssh.ssh_command(sh, 'pwd').strip()
    dirname = os.path.join(pwd, "testdir1")
    ssh.ssh_command(sh, 'rm -rf ' + dirname)

    # the directory does not exist and is not created: expect None
    assert ssh.check_dir(sh, dirname, exit_on_fail=False, try_to_create=False)[0] is None
    # the directory does not exist and should be created: expect True
    assert ssh.check_dir(sh, dirname, exit_on_fail=False, try_to_create=True)[0] is True
    # the directory now exists: expect True
    assert ssh.check_dir(sh, dirname, exit_on_fail=False, try_to_create=False)[0] is True
    del sh
def test_ssh_resource(testconfig):
    """
    tests ssh.ssh_resource in single-command and shell-session modes
    """
    import akrr.util.ssh as ssh

    sshcfg = testconfig['ssh']
    resource = prep_resource_dict(testconfig)

    # run single command
    assert ssh.ssh_resource(resource, 'whoami').strip() == sshcfg['user']

    # run shell session
    sh = ssh.ssh_resource(resource)
    assert ssh.ssh_command(sh, 'whoami').strip() == sshcfg['user']
    del sh
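# Hypothetical sketch of the prep_resource_dict helper used by the tests above:
# it is assumed to reshape the ssh test settings into the resource-dict form
# that akrr.util.ssh.ssh_resource expects. The real helper is defined elsewhere
# in the test suite and may set additional fields.
def prep_resource_dict(testconfig):
    sshcfg = testconfig['ssh']
    return {
        'remote_access_node': sshcfg['host'],
        'ssh_username': sshcfg['user'],
        'ssh_password': sshcfg['password'],
        'ssh_private_key_file': sshcfg['private_key_name'],
        'ssh_private_key_password': sshcfg['private_key_passphrase'],
    }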
server_param = {
    "cloud_cli": GoogleCloudCLI(),
    "project": "buffalo-openxdmod",
    "name": "akrr-test",
    "zone": "us-central1-a",
    "machine_type": "e2-highcpu-32",
    "network_tier": "PREMIUM",
    "image": "cos-89-16108-470-1",
    "image_project": "cos-cloud",
    "boot_disk_size": "30GB",
    "boot_disk_type": "pd-balanced",
    "boot_disk_device_name": "akrr_test",
    "ssh_username": "******",
    "ssh_private_key_file": os.path.expanduser("~/.ssh/id_rsa_googlcloud"),
    "docker_username": "******",
    "docker_password": ""
}

server = GoogleCloudServer(**server_param)
server.create(delete_if_exists=True)
try:
    rsh = ssh.ssh_access_multytry(
        server.ip, username=server.ssh_username,
        private_key_file=server.ssh_private_key_file,
        number_of_attempts=20, sleep_time=5)

    # run the benchmark containers (20-minute timeout each)
    ssh.ssh_command(rsh, "docker run -it --rm --shm-size=4g nsimakov/appker:hpcc", 1200)
    ssh.ssh_command(rsh, "docker run -it --rm --shm-size=4g nsimakov/containers:namd", 1200)
finally:
    # delete the VM even if a benchmark fails, so the instance is not leaked
    server.delete()
def check_the_job_on_remote_machine(self):
    sh = None
    try:
        print("### Checking the job status on remote machine")
        from string import Template
        m_wait_expr = wait_expressions[self.resource['batch_scheduler']]
        cmd = Template(m_wait_expr[0]).substitute(jobId=str(self.RemoteJobID))
        rege = Template(m_wait_expr[2]).substitute(jobId=str(self.RemoteJobID))

        sh = ssh_resource(self.resource)
        msg = ssh_command(sh, cmd)
        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh = None

        match_obj = m_wait_expr[1](rege, msg, m_wait_expr[3])
        if match_obj:
            print("Still in queue. Either waiting or running")
            if datetime.datetime.today() - self.TimeJobSubmetedToRemoteQueue > \
                    self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue):
                print("ERROR:")
                print("Job exceeds the maximal time in queue (%s) and will be terminated." %
                      str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue)))
                print("Removing job from remote queue.")
                self.terminate()

                print("Copying files from remote machine")
                scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                                  os.path.join(self.taskDir, "jobfiles"), "-r")

                print("Deleting all files from remote machine")
                self.delete_remote_folder()

                self.status = "ERROR: Job exceeds the maximal time in queue (%s) and was terminated." % \
                    str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue))
                self.status_info = "\nLast Status report:\n" + msg
                self.ReportFormat = "Error"
                self.ToDoNextString = "check_if_subtasks_done_proccessing_results"
                self.update_sub_tasks()
                return datetime.timedelta(seconds=3)

            self.status = "Still in queue. Either waiting or running"
            self.status_info = msg
            return active_task_default_attempt_repeat
        else:
            print("Not in queue. Either exited with error or executed successfully.")
            print("Copying files from remote machine")
            scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                              os.path.join(self.taskDir, "jobfiles"), "-r")

            print("Deleting all files from remote machine")
            self.delete_remote_folder()

            self.status = "Not in queue. Either exited with error or executed successfully. " \
                          "Copied all files to local machine. Deleted all files from remote machine"
            self.status_info = self.status
            self.ToDoNextString = "check_if_subtasks_done_proccessing_results"
            self.update_sub_tasks()
            self.TimeJobPossiblyCompleted = datetime.datetime.today()
            return datetime.timedelta(seconds=3)
    except Exception:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.status = "ERROR: Can not check the status of the job on the remote resource"
        self.status_info = traceback.format_exc()
        self.fatal_errors_count += 1
        akrr.util.log.log_traceback(self.status)
        self.ToDoNextString = "check_the_job_on_remote_machine"
        return active_task_default_attempt_repeat
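# The wait_expressions table indexed above is assumed, from how its entries are
# used ([0] status-command template, [1] matcher, [2] pattern template,
# [3] flags), to look roughly like the illustrative Slurm entry below. This is
# a sketch, not the real AKRR table, which covers more schedulers.
import re

wait_expressions_example = {
    'slurm': (
        "squeue -u $$USER -j $jobId",  # [0] status command; $$ escapes a literal $
        re.search,                     # [1] matcher applied to the command output
        r"^\s*$jobId\s",               # [2] pattern meaning "job still queued/running"
        re.MULTILINE,                  # [3] flags passed to the matcher
    ),
}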
def create_batch_job_script_and_submit_it(self):
    self.JobScriptName = self.appName + ".job"
    print("### Creating batch job script and submitting it to remote machine")
    # as a current bypass will create a job script remotely and copy it here
    # get ssh to remote resource
    sh = None
    try:
        sh = ssh_resource(self.resource)

        # create remote directories if needed
        # akrr_data
        check_dir(sh, self.resource['akrr_data'], raise_on_fail=True)
        # dir for app
        check_dir(sh, os.path.join(self.resource['akrr_data'], self.appName), raise_on_fail=True)
        # dir for task
        check_dir(sh, self.remoteTaskDir, raise_on_fail=True)
        # cd to remoteTaskDir
        ssh_command(sh, "cd %s" % self.remoteTaskDir)

        # get walltime from DB
        dbdefaults = {}
        try:
            db, cur = akrr.db.get_akrr_db()
            cur.execute('''SELECT resource,app,resource_param,app_param FROM active_tasks
                           WHERE task_id=%s ;''', (self.task_id,))
            raw = cur.fetchall()
            (resource, app, resource_param, app_param) = raw[0]

            cur.execute("""SELECT walltime_limit FROM akrr_default_walltime_limit
                           WHERE resource=%s AND app=%s AND resource_param=%s AND app_param=%s """,
                        (resource, app, resource_param, app_param))
            raw = cur.fetchall()
            if len(raw) > 0:
                dbdefaults['walltime_limit'] = raw[0][0]
            cur.close()
            del db
        except MySQLError:
            pass

        # create job script
        batchvars = {}
        for di in [self.resource, self.app, dbdefaults, self.resourceParam, self.appParam]:
            batchvars.update(di)

        # stack the subtasks
        sub_task_info = self.get_sub_task_info()
        if batchvars['shuffleSubtasks']:
            random.shuffle(sub_task_info)

        sub_tasks_execution = ""
        for subtask_id, subtask_status, subtask_datetime_stamp, \
                subtask_resource, subtask_app, subtask_task_param in sub_task_info:
            remote_sub_task_dir = self.get_remote_task_dir(
                self.resource['akrr_data'], subtask_app, subtask_datetime_stamp)
            sub_task_job_script_name = self.get_job_script_name(subtask_app)
            sub_task_job_script_path = os.path.join(remote_sub_task_dir, sub_task_job_script_name)
            sub_tasks_execution += "cd " + remote_sub_task_dir + "\n"
            sub_tasks_execution += "echo Starting " + subtask_app + "\n"
            sub_tasks_execution += self.resource['shell'] + " " + sub_task_job_script_path + \
                                   " > stdout 2> stderr\n"
            sub_tasks_execution += "echo Done with " + subtask_app + "\n\n"

        batchvars['subTasksExecution'] = sub_tasks_execution

        # calculate the number of nodes and cores
        if 'nnodes' in batchvars:
            tmp_num_nodes = batchvars['nnodes']
            tmp_num_cores = tmp_num_nodes * batchvars['ppn']
        else:
            tmp_num_cores = batchvars['ncores']
            if tmp_num_cores % batchvars['ppn'] == 0:
                tmp_num_nodes = tmp_num_cores // batchvars['ppn']
            else:
                tmp_num_nodes = tmp_num_cores // batchvars['ppn'] + 1

        batchvars['akrr_num_of_cores'] = tmp_num_cores
        batchvars['akrr_num_of_nodes'] = tmp_num_nodes

        # set batchvars remaps
        batchvars['akrr_ppn'] = batchvars['ppn']
        batchvars['akrrNCoresToBorder'] = batchvars['akrr_ppn'] * batchvars['akrr_num_of_nodes']
        batchvars['akrr_task_work_dir'] = self.remoteTaskDir
        # e.g. walltime_limit=90 (minutes) gives "01:30:00"
        batchvars['akrr_walltime_limit'] = "%02d:%02d:00" % (
            int(batchvars['walltime_limit']) // 60, int(batchvars['walltime_limit']) % 60)
        batchvars['localPATH'] = ssh_command(sh, "echo $PATH").strip()
        batchvars['akrr_appkernel_name'] = self.app['name']
        batchvars['akrr_resource_name'] = self.resource['name']
        batchvars['akrr_time_stamp'] = self.timeStamp
        if batchvars['akrr_num_of_nodes'] == 1:
            batchvars['akk_ppn_or_cores_on_one_node'] = batchvars['akrr_num_of_cores']
        else:
            batchvars['akk_ppn_or_cores_on_one_node'] = batchvars['akrr_ppn']

        if 'node_list_setter_template' not in batchvars:
            batchvars['node_list_setter_template'] = \
                batchvars['node_list_setter'][batchvars['batch_scheduler']]

        # process templates
        batchvars['akrrCommonCommands'] = akrr.util.format_recursively(
            batchvars['akrr_common_commands_template'], batchvars, keep_double_brackets=True)
        batchvars['akrrCommonCleanup'] = akrr.util.format_recursively(
            batchvars['akrr_common_cleanup_template'], batchvars, keep_double_brackets=True)

        # do parameters adjustment
        if 'process_params' in batchvars:
            batchvars['process_params'](batchvars)

        # generate job script and copy it to the resource
        job_script = akrr.util.format_recursively(self.resource["batch_job_template"], batchvars)
        with open(os.path.join(self.taskDir, "jobfiles", self.JobScriptName), "w") as fout:
            fout.write(job_script)
        scp_to_resource(self.resource, os.path.join(self.taskDir, "jobfiles", self.JobScriptName),
                        os.path.join(self.remoteTaskDir))
        ssh_command(sh, "cat %s " % self.JobScriptName)

        # send to queue
        from string import Template
        send_to_queue = Template(submit_commands[self.resource['batch_scheduler']]).substitute(
            scriptPath=self.JobScriptName)
        msg = ssh_command(sh, send_to_queue)
        match_obj = re.search(job_id_extract_patterns[self.resource['batch_scheduler']], msg,
                              re.M | re.S)
        if match_obj:
            try:
                job_id = int(match_obj.group(1))
            except (ValueError, IndexError):
                raise AkrrError("Can't get job id. " + msg)
        else:
            raise AkrrError("Can't get job id. " + msg)
        ssh_command(sh, "echo %d > job.id" % job_id)

        # copy job id to subtasks
        for subtask_id, subtask_status, subtask_datetime_stamp, subtask_resource, \
                subtask_app, subtask_task_param in sub_task_info:
            remote_sub_task_dir = self.get_remote_task_dir(
                self.resource['akrr_data'], subtask_app, subtask_datetime_stamp)
            ssh_command(sh, "cp job.id %s" % remote_sub_task_dir)

        self.RemoteJobID = job_id
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh = None

        print("\nRemoteJobID=", self.RemoteJobID)
        print("Copying files from remote machine")
        scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                          os.path.join(self.taskDir, "jobfiles"), "-r")

        # update time_submitted_to_queue in DB
        db, cur = akrr.db.get_akrr_db()
        cur.execute('''UPDATE active_tasks SET time_submitted_to_queue=%s
                       WHERE task_id=%s ;''',
                    (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        cur.close()
        del db

        self.set_method_to_run_next(
            "check_the_job_on_remote_machine",
            "Created batch job script and have submitted it to remote queue.",
            "Remote job ID is %d" % self.RemoteJobID)

        # check the first time in 1 minute
        return datetime.timedelta(days=0, hours=0, minutes=1)
    except Exception:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.set_method_to_run_next(
            None, "ERROR: Can not create batch job script and submit it to remote queue",
            traceback.format_exc())
        if cfg.max_fails_to_submit_to_the_queue >= 0:
            if hasattr(self, "fails_to_submit_to_the_queue"):
                self.fails_to_submit_to_the_queue += 1
                if self.fails_to_submit_to_the_queue > cfg.max_fails_to_submit_to_the_queue:
                    # stop execution of the task and submit results to db
                    self.set_method_to_run_next("push_to_db")
                    result_file = os.path.join(self.taskDir, "result.xml")
                    self.write_error_xml(result_file)
                    return datetime.timedelta(seconds=3)
            else:
                self.fails_to_submit_to_the_queue = 1
        else:
            self.fatal_errors_count += 1

        akrr.util.log.log_traceback(self.status)
        return cfg.repeat_after_fails_to_submit_to_the_queue
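# Illustrative shapes for the two scheduler tables used above (assumptions,
# not the exact AKRR definitions): submit_commands maps a scheduler name to a
# string.Template for its submit command, and job_id_extract_patterns to a
# regex whose group(1) captures the numeric job id from the submit output.
submit_commands_example = {
    'slurm': "sbatch $scriptPath",
    'pbs': "qsub $scriptPath",
}
job_id_extract_patterns_example = {
    'slurm': r"Submitted batch job (\d+)",  # sbatch prints "Submitted batch job 12345"
    'pbs': r"^(\d+)\.",                     # qsub prints e.g. "12345.server-name"
}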
def check_the_job_on_remote_machine(self):
    sh = None
    try:
        print("### Checking the job status on remote machine")
        from string import Template

        sh = ssh.ssh_resource(self.resource)

        # if it is a subtask, get the master task's job id from the job.id file
        # (the master task places it there)
        if self.RemoteJobID == 0:
            try:
                self.RemoteJobID = int(ssh.ssh_command(
                    sh, "cat %s" % os.path.join(self.remoteTaskDir, "job.id")))
            except Exception as e:
                log.error("Can not get remote job ID: %s", str(e))
                self.RemoteJobID = 0

        m_wait_expression = wait_expressions[self.resource['batch_scheduler']]
        cmd = Template(m_wait_expression[0]).substitute(jobId=str(self.RemoteJobID))
        rege = Template(m_wait_expression[2]).substitute(jobId=str(self.RemoteJobID))

        msg = ssh.ssh_command(sh, cmd)
        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh = None

        if self.RemoteJobID == 0:
            return active_task_default_attempt_repeat

        match_obj = m_wait_expression[1](rege, msg, m_wait_expression[3])
        if match_obj:
            log.info("Still in queue. Either waiting or running")
            if datetime.datetime.today() - self.TimeJobSubmetedToRemoteQueue > \
                    self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue):
                log.error("Job exceeds the maximal time in queue (%s) and will be terminated. "
                          "Removing job from remote queue." %
                          str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue)))
                self.terminate()

                log.info("Copying files from remote machine")
                ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                                      os.path.join(self.taskDir, "jobfiles"), "-r")

                log.info("Deleting all files from remote machine")
                self.delete_remote_folder()

                self.set_method_to_run_next(
                    "process_results",
                    "ERROR: Job exceeds the maximal time in queue (%s) and was terminated." %
                    str(self.taskParam.get('MaxTimeInQueue', cfg.max_time_in_queue)),
                    "Last Status report:\n" + msg)
                self.ReportFormat = "Error"
                return datetime.timedelta(seconds=3)

            self.set_method_to_run_next(None, "Still in queue. Either waiting or running", msg)
            return active_task_default_attempt_repeat

        log.info("Not in queue. Either exited with error or executed successfully.")
        log.info("Copying files from remote machine")
        ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                              os.path.join(self.taskDir, "jobfiles"), "-r")

        log.info("Deleting all files from remote machine")
        self.delete_remote_folder()

        self.set_method_to_run_next(
            "process_results",
            "Not in queue. Either exited with error or executed successfully. "
            "Copied all files to local machine. Deleted all files from remote machine")

        self.TimeJobPossiblyCompleted = datetime.datetime.today()
        return datetime.timedelta(seconds=3)
    except Exception as e:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        log.error("Got exception during check_the_job_on_remote_machine: %s", e)
        self.set_method_to_run_next(
            None, "ERROR: Can not check the status of the job on the remote resource",
            traceback.format_exc())
        self.fatal_errors_count += 1
        akrr.util.log.log_traceback(self.status)
        return active_task_default_attempt_repeat
def create_batch_job_script_and_submit_it(self, do_not_submit_to_queue=False):
    self.JobScriptName = self.get_job_script_name(self.appName)
    log.info("Creating batch job script and submitting it to remote machine")
    # as a current bypass will create a job script remotely and copy it here
    # get ssh to remote resource
    sh = None
    try:
        sh = ssh.ssh_resource(self.resource)

        # create remote directories if needed
        # akrr_data
        ssh.check_dir(sh, self.resource['akrr_data'], try_to_create=True)
        # dir for app
        ssh.check_dir(sh, os.path.join(self.resource['akrr_data'], self.appName),
                      try_to_create=True)
        # dir for task
        ssh.check_dir(sh, self.remoteTaskDir, try_to_create=True)
        # cd to remoteTaskDir
        ssh.ssh_command(sh, "cd %s" % self.remoteTaskDir)

        # generate and upload the batch job script
        self.generate_batch_job_script()
        ssh.scp_to_resource(self.resource,
                            os.path.join(self.taskDir, "jobfiles", self.JobScriptName),
                            os.path.join(self.remoteTaskDir))

        if do_not_submit_to_queue:
            sh.sendline("exit")
            sh.close(force=True)
            return

        ssh.ssh_command(sh, "cat %s " % self.JobScriptName)

        # send to queue
        from string import Template
        job_id = 0
        if 'masterTaskID' not in self.taskParam:
            # i.e. submit to queue only if the task is independent
            send_to_queue = Template(submit_commands[self.resource['batch_scheduler']]).substitute(
                scriptPath=self.JobScriptName)
            msg = ssh.ssh_command(sh, send_to_queue)
            match_obj = re.search(job_id_extract_patterns[self.resource['batch_scheduler']], msg,
                                  re.M | re.S)
            if match_obj:
                try:
                    job_id = int(match_obj.group(1))
                except (ValueError, TypeError, IndexError):
                    raise AkrrError("Can't get job id:\n" + msg)
            else:
                raise AkrrError("Can't get job id:\n" + msg)

            # report submit attributes to the gateway if requested
            if self.resource["gateway_reporting"]:
                ssh.ssh_command(sh, "module load gateway-usage-reporting")
                ssh.ssh_command(sh, r'gateway_submit_attributes -gateway_user ' +
                                self.resource["gateway_user"] +
                                r''' -submit_time "`date '+%F %T %:z'`" -jobid ''' + str(job_id))

        ssh.ssh_command(sh, "echo %d > job.id" % job_id)

        self.RemoteJobID = job_id
        self.TimeJobSubmetedToRemoteQueue = datetime.datetime.today()

        sh.sendline("exit")
        sh.close(force=True)
        del sh
        sh = None

        print("\nRemoteJobID=", self.RemoteJobID)
        print("Copying files from remote machine")
        ssh.scp_from_resource(self.resource, os.path.join(self.remoteTaskDir, "*"),
                              os.path.join(self.taskDir, "jobfiles"), "-r")

        # update time_submitted_to_queue in DB
        db, cur = akrr.db.get_akrr_db()
        cur.execute('''UPDATE active_tasks SET time_submitted_to_queue=%s
                       WHERE task_id=%s ;''',
                    (datetime.datetime.today().strftime("%Y-%m-%d %H:%M:%S"), self.task_id))
        cur.close()
        del db

        if 'masterTaskID' not in self.taskParam:
            # i.e. independent task
            self.set_method_to_run_next(
                "check_the_job_on_remote_machine",
                "Created batch job script and have submitted it to remote queue.",
                "Remote job ID is %d" % self.RemoteJobID)
            # check the first time in 1 minute
            return datetime.timedelta(days=0, hours=0, minutes=1)
        else:
            # i.e. this is a dependent subtask; the master task will execute its job script
            self.set_method_to_run_next(
                "check_the_job_on_remote_machine",
                "Created batch job script.",
                "Created batch job script. Waiting for master task to execute it.")
            # the master task will update the time when it finishes the task execution
            return datetime.timedelta(days=111 * 365)
    except Exception as e:
        if sh is not None:
            sh.sendline("exit")
            sh.close(force=True)
            del sh
        self.set_method_to_run_next(
            None, "ERROR: Can not create batch job script and submit it to remote queue",
            traceback.format_exc())
        log.error("Got exception during attempt to create and submit job: %s", str(e))
        if cfg.max_fails_to_submit_to_the_queue >= 0:
            self.fails_to_submit_to_the_queue += 1
            if self.fails_to_submit_to_the_queue > cfg.max_fails_to_submit_to_the_queue or \
                    (self.taskParam['test_run'] is True and self.fails_to_submit_to_the_queue >= 2):
                # stop execution of the task and submit results to db
                self.set_method_to_run_next("push_to_db")
                result_file = os.path.join(self.taskDir, "result.xml")
                self.write_error_xml(result_file)
                return datetime.timedelta(seconds=3)
        else:
            self.fatal_errors_count += 1

        akrr.util.log.log_traceback(self.status)
        return cfg.repeat_after_fails_to_submit_to_the_queue
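# Hypothetical usage sketch (not from the AKRR sources): with
# do_not_submit_to_queue=True the method only generates the job script and
# copies it to the resource, which is handy when debugging a
# batch_job_template. Here "task" is a placeholder for an instance of this
# task class.
#
#     task.create_batch_job_script_and_submit_it(do_not_submit_to_queue=True)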