def make_job_script(self, cfg, job_name):
    """Creates the job script.

    Once all jobs in the array are launched, the job script can be created.
    It is a bash script that takes the job index as its single argument.
    The index is passed in the bsub command as '$LSB_JOBINDEX', which bsub
    expands to the actual index when launching each job in the array. The
    script is just a giant case statement that switches on the job index to
    run that specific job. This is preferred over creating individual
    scripts for each job, which incurs additional file I/O overhead when
    the scratch area is on NFS, causing a slowdown.

    Returns the path to the job script.
    """
    lines = ["#!/usr/bin/env bash\nset -e\n"]

    # Activate the python virtualenv if it exists.
    if Launcher.python_venv:
        lines += ["source {}/bin/activate\n".format(Launcher.python_venv)]

    lines += ["case $1 in\n"]
    for job in LsfLauncher.jobs[cfg][job_name]:
        # Redirect the job's stdout and stderr to its log file.
        cmd = "{} > {} 2>&1".format(job.deploy.cmd,
                                    job.deploy.get_log_path())
        lines += ["  {})\n".format(job.index), "    {};;\n".format(cmd)]

    # Throw an error as a sanity check if the job index is invalid.
    lines += [
        "  *)\n",
        "    echo \"ERROR: Illegal job index: $1\" 1>&2; exit 1;;\n",
        "esac\n"
    ]
    if Launcher.python_venv:
        lines += ["deactivate\n"]

    job_script = Path(LsfLauncher.jobs_dir[cfg], job_name)
    try:
        with open(job_script, "w", encoding='utf-8') as f:
            f.writelines(lines)
    except IOError as e:
        err_msg = "ERROR: Failed to write {}:\n{}".format(job_script, e)
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg)

    log.log(VERBOSE, "[job_script]: %s", job_script)
    return job_script
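# For illustration only: with two hypothetical jobs in the array, the
# generated job script would look roughly like this (the job commands and
# log paths are placeholders, not real dvsim output; virtualenv activation
# lines would precede the case statement when Launcher.python_venv is set):
#
#   #!/usr/bin/env bash
#   set -e
#   case $1 in
#     1)
#       <job1-cmd> > <job1-log> 2>&1;;
#     2)
#       <job2-cmd> > <job2-log> 2>&1;;
#     *)
#       echo "ERROR: Illegal job index: $1" 1>&2; exit 1;;
#   esac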
@staticmethod
def prepare_workspace(project, repo_top, args):
    '''Overrides Launcher.prepare_workspace.'''
    # Since we dispatch to remote machines, a project-specific Python
    # virtualenv, if one exists, needs to be activated when launching the
    # job.
    Launcher.set_python_venv(project)
    if Launcher.python_venv is None:
        return

    # python_venv needs to be a valid tarfile. Extract it in the scratch
    # area if it does not already exist there. It is up to the user to
    # delete it if it is stale.
    if tarfile.is_tarfile(Launcher.python_venv):
        path = Path(args.scratch_root, Path(Launcher.python_venv).stem)
        if not path.is_dir():
            with tarfile.open(Launcher.python_venv, mode='r') as tar:
                tar.extractall(path=args.scratch_root)
        Launcher.python_venv = path
    else:
        raise LauncherError("{} is not a valid tar file".format(
            Launcher.python_venv))
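# A minimal sketch of how a compatible virtualenv tarball might be produced
# ahead of time. pack_venv() and its paths are hypothetical, not part of
# dvsim. Archiving the directory under its own name means that extracting
# the tar in the scratch root recreates <scratch_root>/<tar stem>, which is
# exactly what the Path(...).stem lookup above expects. This also assumes
# the venv was built to be relocatable, since it is activated from a
# different path on the remote machine.

def pack_venv(venv_dir, out_tar):
    '''Package a virtualenv directory into a tar archive that
    prepare_workspace() can later extract, e.g.
    pack_venv('/path/to/my_venv', '/path/to/my_venv.tar').'''
    venv_dir = Path(venv_dir)
    with tarfile.open(out_tar, mode='w') as tar:
        tar.add(venv_dir, arcname=venv_dir.name)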
def _do_launch(self):
    # Update the shell's env vars with self.exports. Values in exports must
    # replace the values in the shell's env vars if the keys match.
    exports = os.environ.copy()
    if self.deploy.exports:
        exports.update(self.deploy.exports)

    # Clear the magic MAKEFLAGS variable from exports if necessary. This
    # variable is used by recursive Make calls to pass variables from one
    # level to the next. Here, self.cmd is a call to Make but it's
    # logically a top-level invocation: we don't want to pollute the flow's
    # Makefile with Make variables from any wrapper that called dvsim.
    if 'MAKEFLAGS' in exports:
        del exports['MAKEFLAGS']

    self._dump_env_vars(exports)

    args = shlex.split(self.deploy.cmd)
    try:
        f = open(self.deploy.get_log_path(),
                 "w",
                 encoding="UTF-8",
                 errors="surrogateescape")
        f.write("[Executing]:\n{}\n\n".format(self.deploy.cmd))
        f.flush()
        self.process = subprocess.Popen(args,
                                        bufsize=4096,
                                        universal_newlines=True,
                                        stdout=f,
                                        stderr=f,
                                        env=exports)
    # Catch OSError as well, since opening the log file can fail with an
    # I/O error that is not a SubprocessError.
    except (subprocess.SubprocessError, OSError) as e:
        raise LauncherError('IO Error: {}\nSee {}'.format(
            e, self.deploy.get_log_path()))
    finally:
        self._close_process()

    self._link_odir("D")
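# For illustration, shlex.split() tokenizes the command string the way a
# POSIX shell would, so quoted arguments survive as single tokens and the
# command can be run without shell=True (the command below is a made-up
# example):
#
#   >>> shlex.split('make -C "build dir" TARGET=sim')
#   ['make', '-C', 'build dir', 'TARGET=sim']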
def _do_launch(self):
    # Add self to the list of jobs.
    job_name = self.deploy.job_name
    cfg = self.deploy.sim_cfg
    job_total = len(LsfLauncher.jobs[cfg][job_name])

    # The actual launching of the bsub command cannot happen until the
    # Scheduler has dispatched ALL jobs in the array.
    if self.index < job_total:
        return

    job_script = self.make_job_script(cfg, job_name)

    # Update the shell's env vars with self.exports. Values in exports must
    # replace the values in the shell's env vars if the keys match.
    exports = os.environ.copy()
    if self.deploy.exports:
        exports.update(self.deploy.exports)

    # Clear the magic MAKEFLAGS variable from exports if necessary. This
    # variable is used by recursive Make calls to pass variables from one
    # level to the next. Here, self.cmd is a call to Make but it's
    # logically a top-level invocation: we don't want to pollute the flow's
    # Makefile with Make variables from any wrapper that called dvsim.
    if 'MAKEFLAGS' in exports:
        del exports['MAKEFLAGS']

    self._dump_env_vars(exports)

    # TODO: Arbitrarily set the max slot-limit to 100.
    job_array = "{}[1-{}]".format(job_name, job_total)
    if job_total > 100:
        job_array += "%100"

    # TODO: This needs to be moved to an Hjson.
    if self.deploy.sim_cfg.tool == "vcs":
        job_rusage = "'rusage[vcssim=1,vcssim_dynamic=1:duration=1]'"
    elif self.deploy.sim_cfg.tool == "xcelium":
        job_rusage = "'rusage[xcelium=1,xcelium_dynamic=1:duration=1]'"
    else:
        job_rusage = None

    # Launch the job array.
    cmd = [
        "bsub",
        # TODO: LSF project name could be site specific!
        "-P",
        cfg.project,
        "-J",
        job_array,
        "-oo",
        "{}.%I.out".format(job_script),
        "-eo",
        "{}.%I.out".format(job_script)
    ]
    if job_rusage:
        cmd += ["-R", job_rusage]
    cmd += ["/usr/bin/bash {} $LSB_JOBINDEX".format(job_script)]

    try:
        p = subprocess.run(cmd,
                           check=True,
                           timeout=60,
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE,
                           env=exports)
    except subprocess.CalledProcessError as e:
        # Mark all jobs in this array with this failure message.
        err_msg = e.stderr.decode("utf-8").strip()
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg)

    # Extract the job ID from the bsub output.
    result = p.stdout.decode("utf-8").strip()
    parts = result.split('Job <')
    job_id = parts[1].split('>')[0] if len(parts) > 1 else None
    if not job_id:
        err_msg = "Job ID not found in bsub output:\n{}".format(result)
        self._post_finish_job_array(cfg, job_name, err_msg)
        raise LauncherError(err_msg)

    for job in LsfLauncher.jobs[cfg][job_name]:
        job.bsub_out = Path("{}.{}.out".format(job_script, job.index))
        job.job_id = "{}[{}]".format(job_id, job.index)
        job._link_odir("D")
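# For illustration, a successful bsub submission typically prints a line
# like the following (the job ID and queue name here are made up), from
# which the parsing above recovers "1234" as the array's base job ID:
#
#   Job <1234> is submitted to default queue <normal>.
#
# Individual jobs in the array are then addressable as 1234[1], 1234[2],
# and so on, matching the job.job_id values assigned above.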