import inspect
import logging
import os
import shutil
import sys

import numpy as np

# get_config() and get_hpc() are assumed to be provided elsewhere in this package.


def setup_ptgenerator_slurm(model, c, hpc="getafix"):
    if hpc is None:
        raise ValueError("HPC environment variable is not set. Please set it to an hpc system, like export HPC=nersc")
    config = get_config()
    hpc_config = config.get("hpcs", {}).get(hpc.lower(), {})
    # Resolve the jobscript template and the package path relative to this file
    job_path = os.path.join(os.path.dirname(inspect.stack()[0][1]), f"jobscripts/slurm_pt_generator_{hpc}.job")
    python_path = os.path.abspath(os.path.dirname(inspect.stack()[0][1]))
    # Deterministic job name built from the model class and the sorted config keys
    unique_name = model.__class__.__name__ + "_" + "".join([k + str(c[k]) for k in sorted(c.keys())]) + ".job"
    job_dir = os.path.abspath("jobs")
    output = os.path.join(job_dir, "zlog")
    d = {
        "name": unique_name,
        "account": hpc_config["account"],
        "conda_env": hpc_config["conda_env"],
        "mpi_module": hpc_config["mpi_module"],
        "fort_compile_module": hpc_config["fort_compile_module"],
        "path": python_path,
        "output": output,
        "model": model.__class__.__name__,
    }
    with open(job_path) as f:
        raw_template = f.read()
    d.update(c)
    template = raw_template.format(**d)
    filename = os.path.join(job_dir, unique_name)
    os.makedirs(job_dir, exist_ok=True)
    with open(filename, "w") as f:
        f.write(template)
    logging.info(f"Submitting regen for {filename}")
    os.system(f"{config['hpc_submit_command']} {filename}")
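# A minimal sketch of the templating used above: str.format substitutes the
# {placeholder} fields in a jobscript template with entries from the merged
# dict. The template text and values below are illustrative only, not the
# real jobscripts/slurm_pt_generator_*.job files shipped with the package.
_EXAMPLE_PT_TEMPLATE = """#!/bin/bash
#SBATCH --job-name={name}
#SBATCH --account={account}
#SBATCH --output={output}
conda activate {conda_env}
python generate.py {model}
"""


def _example_fill_pt_template():
    # Hypothetical values standing in for hpc_config entries and model metadata
    d = {"name": "DemoModel_om0.31.job", "account": "myaccount", "output": "jobs/zlog", "conda_env": "barry", "model": "DemoModel"}
    return _EXAMPLE_PT_TEMPLATE.format(**d)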
def write_jobscript_slurm(filename, name=None, num_tasks=24, num_concurrent=24, delete=False, hpc="getafix"):
    if hpc is None:
        raise ValueError("HPC environment variable is not set. Please set it to an hpc system, like export HPC=nersc")
    config = get_config()
    directory = os.path.dirname(os.path.abspath(filename))
    executable = os.path.basename(filename)
    if name is None:
        name = executable[:-3]  # default job name is the script name without ".py"
    output_dir = directory + os.sep + "out_files"
    q_dir = directory + os.sep + "job_files"
    if not os.path.exists(q_dir):
        os.makedirs(q_dir, exist_ok=True)
    if delete and os.path.exists(output_dir):
        logging.debug("Deleting %s" % output_dir)
        shutil.rmtree(output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    # Factor in jobs executing multiple fits
    hpc_config = config.get("hpcs", {}).get(hpc, {})
    if hpc_config.get("num_fits_per_job", 1) >= num_tasks:
        hpc_config["num_fits_per_job"] = num_tasks
        num_tasks = 1
    else:
        num_tasks = int(np.ceil(num_tasks / hpc_config.get("num_fits_per_job", 1)))

    d = {
        "directory": directory,
        "executable": executable,
        "name": name,
        "output_dir": output_dir,
        "num_concurrent": num_concurrent,
        "num_tasks": num_tasks,
    }
    d.update(hpc_config)
    slurm_job = os.path.join(os.path.dirname(os.path.abspath(inspect.stack()[0][1])), f"jobscripts/slurm_fit_{hpc}.job")
    with open(slurm_job) as f:
        raw_template = f.read()
    template = raw_template.format(**d)
    n = "%s/%s.q" % (q_dir, executable[: executable.index(".py")])
    with open(n, "w") as f:
        f.write(template)
    logging.info("SLURM Jobscript at %s" % n)
    return n
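# Sketch of the task-batching arithmetic above: when an HPC profile lets each
# SLURM array task run several fits (num_fits_per_job), the number of array
# tasks shrinks accordingly. This helper is illustrative, not part of the
# package API.
def _example_num_tasks(num_fits, num_fits_per_job):
    if num_fits_per_job >= num_fits:
        return 1  # a single task runs every fit
    return int(np.ceil(num_fits / num_fits_per_job))


# e.g. _example_num_tasks(100, 4) -> 25, _example_num_tasks(10, 24) -> 1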
def fit(self, file):
    num_concurrent = self.get_num_concurrent()
    num_jobs = self.get_num_jobs()
    num_models = len(self.model_datasets)
    self.logger.info(f"With {num_models} models+datasets and {self.num_walkers} walkers, have {num_jobs} jobs")
    if self.is_local():
        # Only do the first model+dataset on a local computer as a test
        self.logger.info("Running locally on the 0th index.")
        self._run_fit(0, 0)
    else:
        if len(sys.argv) == 1:
            # If launching the job for the first time
            if os.path.exists(self.temp_dir):
                if self.remove_output:
                    self.logger.info("Deleting %s" % self.temp_dir)
                    shutil.rmtree(self.temp_dir)
            hpc = get_hpc()
            filename = write_jobscript_slurm(
                file, name=os.path.basename(file), num_tasks=self.get_num_jobs(), num_concurrent=num_concurrent, delete=False, hpc=hpc
            )
            self.logger.info("Running batch job at %s" % filename)
            config = get_config()
            os.system(f"{config['hpc_submit_command']} {filename}")
        else:
            # Or if running a specific fit to a model+dataset pair
            if sys.argv[1].isdigit():
                index = int(sys.argv[1])
            else:
                index = -1
            if index != -1 and index < self.get_num_jobs():
                mi, wi = self._get_indexes_from_index(index)
                self.logger.info("Running model_dataset %d, walker number %d" % (mi, wi))
                self._run_fit(mi, wi)
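# fit() above is a method (note the "self" parameter) of the fitting class
# defined elsewhere in this package. A hypothetical driver script would look
# something like the following; the constructor and its argument are
# illustrative stand-ins, only fit(__file__) comes from the code above:
#
#     if __name__ == "__main__":
#         fitter = Fitter(temp_dir="temp")   # hypothetical construction
#         fitter.fit(__file__)
#
# On the first invocation (no command-line argument) it writes and submits a
# SLURM jobscript; each array task then re-runs the same script with an
# integer index, which routes into _run_fit for one model+walker pair.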
def is_local(self):
    return shutil.which(get_config()["hpc_determining_command"]) is None
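# is_local() treats the machine as local when the configured queue-detection
# executable cannot be found on the PATH (shutil.which returns None). The keys
# and values below are an illustrative sketch of what the package-level config
# might contain, not its actual contents:
#
#     {
#         "hpc_determining_command": "sbatch",   # assumed example value
#         "hpc_submit_command": "sbatch",
#         "hpcs": {"getafix": {...}, "nersc": {...}},
#     }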