import argparse
import getpass
import os
import time

import pandas as pd

# `hu` follows the haven library convention; `job_configs` is assumed to be a
# project-local settings module that defines ACCOUNT_ID and JOB_CONFIG.
from haven import haven_utils as hu

import job_configs


def kill_job(api, job_id):
    """Kill a job and wait until it is dead."""
    job = get_job(api, job_id)

    if job["state"] in ["CANCELLED", "COMPLETED", "FAILED", "TIMEOUT"]:
        print("%s is already dead" % job_id)
    else:
        kill_command = "scancel %s" % job_id
        while True:
            try:
                hu.subprocess_call(kill_command)
                print("%s CANCELLING..." % job_id)
            except Exception as e:
                if "Socket timed out" in str(e):
                    print("scancel timed out; retrying")
                    time.sleep(1)
                    continue
                # re-raise anything that is not a transient timeout
                raise
            break

        # Confirm the job was cancelled
        job = get_job(api, job_id)
        while job["state"] != "CANCELLED":
            time.sleep(2)
            job = get_job(api, job_id)

        print("%s is now dead." % job_id)

def kill_job(job_id):
    """Kill a job, retrying scancel until it goes through."""
    kill_command = "scancel %s" % job_id
    while True:
        try:
            # scancel prints no message on success
            hu.subprocess_call(kill_command)
        except Exception:
            print("scancel timed out; retrying")
            time.sleep(1)
            continue
        break

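# Usage sketch for the two variants above (hedged: the job id is a made-up
# placeholder and `api` is whatever scheduler handle the caller already holds):
#
#   kill_job(api, "1234567")  # cancels, then polls until the state is CANCELLED
#   kill_job("1234567")       # lighter variant: issues scancel and returns
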
def launch_job(self, exp_dict, savedir, command, job=None):
    """Submit a job and save its job dict and exp_dict."""
    add_job_utils()
    import haven_jobs_utils as hju

    # Check for duplicates
    if job is not None:
        assert self._assert_no_duplicates(job)

    hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)

    # Define paths
    workdir_job = os.path.join(savedir, "code")

    # Copy the experiment code into the experiment folder
    hu.copy_code(self.workdir + "/", workdir_job)

    # Run the command
    job_command = hju.get_job_command(self.job_config, command, savedir,
                                      workdir=workdir_job)
    job_id = hu.subprocess_call(job_command).replace("\n", "")

    # Verbose
    if self.verbose:
        print("Job_id: %s command: %s" % (job_id, command))

    job_dict = {
        "job_id": job_id,
        "started at (Montreal)": hu.time_to_montreal(),
        "command": command,
    }
    hu.save_json(hju.get_job_fname(savedir), job_dict)

    return job_dict

def get_existing_slurm_job_commands(exp_list, savedir_base):
    """Return the commands of jobs that are still running or pending."""
    existing_job_commands = []
    for exp_dict in exp_list:
        exp_id = hu.hash_dict(exp_dict)
        savedir = os.path.join(savedir_base, exp_id)
        file_name = os.path.join(savedir, "job_dict.json")
        if not os.path.exists(file_name):
            continue
        job_dict = hu.load_json(file_name)
        job_id = job_dict["job_id"]
        job_status = hu.subprocess_call(
            "scontrol show job %s" % job_id).split("JobState=")[1].split(" ")[0]
        if job_status in ("RUNNING", "PENDING"):
            existing_job_commands += [job_dict["command"]]

    return existing_job_commands

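# Usage sketch (hedged: `exp_list` and `savedir_base` are as in the function
# above; `commands` is a hypothetical list of commands about to be launched):
#
#   active = get_existing_slurm_job_commands(exp_list, savedir_base)
#   todo = [c for c in commands if c not in active]  # skip already-queued work
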
def get_jobs(user_name):
    """Get the jobs of the given user from squeue."""
    command = "squeue --user=%s" % user_name
    while True:
        try:
            job_list = hu.subprocess_call(command)
            job_list = job_list.split("\n")
            # skip the header row and any trailing empty line
            job_list = [v.split()[0] for v in job_list[1:] if v.strip()]
            result = []
            for job_id in job_list:
                result.append(get_job(job_id))
        except Exception:
            print("squeue timed out; retrying")
            time.sleep(1)
            continue
        break

    return result

def get_job(job_id):
    """Get job information as a dict of scontrol's key=value pairs."""
    command = "scontrol show job %s" % job_id
    job_info = ""
    while True:
        try:
            job_info = hu.subprocess_call(command)
            job_info = job_info.replace("\n", "")
            job_info = {v.split("=")[0]: v.split("=")[1]
                        for v in job_info.split(" ") if "=" in v}
        except Exception:
            print("scontrol timed out; retrying")
            time.sleep(1)
            continue
        break

    return job_info

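# `get_job` returns scontrol's "key=value" pairs as a plain dict. Illustrative
# values (not captured from a real cluster):
#
#   info = get_job("1234567")
#   info["JobState"]  # e.g. "RUNNING"
#   info["RunTime"]   # e.g. "00:12:34"
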
def get_jobs(api, account_id):
    """Get all jobs launched by the current user."""
    job_list = ""
    command = 'squeue --user=%s --format="%%.18i %%.8T"' % getpass.getuser()
    while True:
        try:
            job_list = hu.subprocess_call(command)
        except Exception as e:
            if "Socket timed out" in str(e):
                print("squeue timed out; retrying")
                time.sleep(1)
                continue
            # re-raise anything that is not a transient timeout
            raise
        break

    # skip the header row and the trailing empty line
    return [{"job_id": j.split()[0], "state": j.split()[1]}
            for j in job_list.split("\n")[1:-1]]

def get_jobs_dict(api, job_id_list, query_size=20):
    """Query sacct and return {job_id: {"runs", "cpuTime", "state"}}."""
    if len(job_id_list) == 0:
        return {}

    jobs_dict = {}
    # build a comma-separated id list for sacct
    command = "sacct --jobs=%s --format=jobid,cputime,state" % str(
        job_id_list)[1:-1].replace(" ", "")
    while True:
        try:
            job_list = hu.subprocess_call(command)
        except Exception as e:
            if "Socket timed out" in str(e):
                print("sacct timed out; retrying")
                time.sleep(1)
                continue
            # re-raise anything that is not a transient timeout
            raise
        break

    lines = job_list.split("\n")
    header = lines[0].split()
    # skip the header, the dashed separator line, and the trailing empty line
    lines = [l.split() for l in lines[2:-1]]

    df = pd.DataFrame(data=lines, columns=header)
    # drop the .batch / .extern sub-steps
    df = df[~df["JobID"].str.contains(r"\.")]
    df = df.rename(mapper={"State": "state",
                           "CPUTime": "cpuTime",
                           "JobID": "job_id"}, axis=1)
    # normalize "CANCELLED by <uid>" to a single state
    df = df.replace({"state": r"CANCELLED.*"}, {"state": "CANCELLED"},
                    regex=True)
    df.insert(loc=0, column="runs", value="")

    # use the job id as the key
    new_df = df.drop(labels="job_id", axis=1)
    new_df.index = df["job_id"].to_list()
    jobs_dict = new_df.to_dict(orient="index")

    return jobs_dict

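# Usage sketch (illustrative values, not real sacct output):
#
#   jobs = get_jobs_dict(api, ["1234567", "1234568"])
#   # {"1234567": {"runs": "", "cpuTime": "00:10:00", "state": "CANCELLED"}, ...}
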
def submit_job(api, account_id, command, job_config, workdir, savedir_logs=None):
    """Write a SLURM script for the command and submit it with sbatch."""
    # Build the SLURM script
    lines = "#! /bin/bash\n"
    lines += "#SBATCH --account=%s\n" % account_id
    for key in list(job_config.keys()):
        lines += "#SBATCH --%s=%s\n" % (key, job_config[key])
    path_log = os.path.join(savedir_logs, "logs.txt")
    lines += "#SBATCH --output=%s\n" % path_log
    path_err = os.path.join(savedir_logs, "err.txt")
    lines += "#SBATCH --error=%s\n" % path_err
    path_code = os.path.join(savedir_logs, "code")
    lines += "#SBATCH --chdir=%s\n" % path_code
    lines += command

    file_name = os.path.join(savedir_logs, "bash.sh")
    hu.save_txt(file_name, lines)

    # Launch the experiment
    submit_command = "sbatch %s" % file_name
    while True:
        try:
            job_id = hu.subprocess_call(submit_command).split()[-1]
        except Exception as e:
            if "Socket timed out" in str(e):
                print("sbatch timed out; retrying")
                time.sleep(1)
                continue
            # re-raise anything that is not a transient timeout
            raise
        break

    # Delete bash.sh
    os.remove(file_name)

    return job_id

def submit_job(command, savedir):
    """Write a SLURM script using the settings in job_configs and submit it."""
    # Build the SLURM script
    lines = "#! /bin/bash\n"
    lines += "#SBATCH --account=%s\n" % job_configs.ACCOUNT_ID
    for key in list(job_configs.JOB_CONFIG.keys()):
        lines += "#SBATCH --%s=%s\n" % (key, job_configs.JOB_CONFIG[key])
    path_log = os.path.join(savedir, "logs.txt")
    path_err = os.path.join(savedir, "err.txt")
    lines += "#SBATCH --output=%s\n" % path_log
    lines += "#SBATCH --error=%s\n" % path_err
    lines += command

    file_name = os.path.join(savedir, "bash.sh")
    hu.save_txt(file_name, lines)

    # Launch the experiment
    submit_command = "sbatch %s" % file_name
    while True:
        try:
            job_id = hu.subprocess_call(submit_command).split()[-1]
        except Exception:
            print("sbatch timed out; retrying")
            time.sleep(1)
            continue
        break

    # Save the command and the job id in job_dict.json
    job_dict = {"command": command, "job_id": job_id}
    hu.save_json(os.path.join(savedir, "job_dict.json"), job_dict)

    # Delete bash.sh
    os.remove(file_name)

    return job_id

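# Usage sketch for this variant (hedged: the command and savedir are
# placeholders; `job_configs` must define ACCOUNT_ID and JOB_CONFIG):
#
#   job_id = submit_job("python trainval.py -e mnist", "results/mnist")
#   print(get_job(job_id)["JobState"])
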
lines = ("#! /bin/bash \n" "#SBATCH --account=%s \n" "#SBATCH --time=%s \n" "#SBATCH --mem-per-cpu=%s \n" "%s") % ( account, time, mem_cpu, command, ) hu.save_txt(file_name, lines) if __name__ == "__main__": # specify the slurm script to run parser = argparse.ArgumentParser() parser.add_argument("--batch", required=True) args = parser.parse_args() submit_command = "sbatch " + args.batch # step 1 - run a job through slurm using `hu.subprocess_call` job_id = hu.subprocess_call(submit_command).split()[-1] # step 2 - get the status of the job from the job_id get_command = "squeue --job %s" % job_id job_status = hu.subprocess_call(get_command) # step 3 - kill the job kill_command = "scancel %s" % job_id info = hu.subprocess_call(kill_command) # no return message after scancel