Exemple #1
0
def setup_ptgenerator_slurm(model, c, hpc="getafix"):
    """Render the pt-generator SLURM template for ``hpc`` and submit it.

    The template ``jobscripts/slurm_pt_generator_{hpc}.job`` is looked up
    relative to the directory of this source file, filled in with values
    from the ``hpcs`` section of the config plus the entries of ``c``,
    written to ``jobs/<unique name>`` under the current working directory
    and handed to the configured ``hpc_submit_command``.

    Args:
        model: Object whose class name is used in the job name and passed
            to the template as ``model``.
        c: Mapping with string keys. Its sorted ``key + str(value)`` pairs
            form part of the job file name, and its entries are merged
            into the template fields, overriding the built-in ones on a
            key clash.
        hpc: Name of the HPC system. Lower-cased for the config lookup,
            used verbatim in the template file name.

    Raises:
        ValueError: If ``hpc`` is ``None``.
        KeyError: If the config entry for ``hpc`` lacks ``account``,
            ``conda_env``, ``mpi_module`` or ``fort_compile_module``, or
            the config has no ``hpc_submit_command``.
    """
    if hpc is None:
        raise ValueError(
            "HPC environment variable is not set. Please set it to an hpc system, like export HPC=nersc"
        )

    settings = get_config()
    cluster = settings.get("hpcs", {}).get(hpc.lower(), {})

    # Directory containing this source file; templates live beneath it.
    here = os.path.dirname(inspect.stack()[0][1])
    template_path = os.path.join(here, f"jobscripts/slurm_pt_generator_{hpc}.job")

    model_name = model.__class__.__name__
    suffix = "".join(k + str(c[k]) for k in sorted(c.keys()))
    unique_name = model_name + "_" + suffix + ".job"
    job_dir = os.path.abspath("jobs")

    # Built before the template is opened so a missing config key is
    # reported ahead of any file-system access.
    fields = {
        "name": unique_name,
        "account": cluster["account"],
        "conda_env": cluster["conda_env"],
        "mpi_module": cluster["mpi_module"],
        "fort_compile_module": cluster["fort_compile_module"],
        "path": os.path.abspath(here),
        "output": os.path.join(job_dir, "zlog"),
        "model": model_name,
    }

    with open(template_path) as handle:
        raw_template = handle.read()
    fields.update(c)
    rendered = raw_template.format(**fields)

    filename = os.path.join(job_dir, unique_name)
    os.makedirs(job_dir, exist_ok=True)
    with open(filename, "w") as handle:
        handle.write(rendered)

    logging.info(f"Submitting regen for {filename}")
    os.system(f"{settings['hpc_submit_command']} {filename}")
Exemple #2
0
def setup_ptgenerator_slurm(model, c):
    """Render the pt-generator SLURM template and submit the job.

    Reads ``jobscripts/slurm_pt_generator.job`` from the directory of this
    source file, substitutes values taken from the top-level config and
    from ``c``, writes the result to ``jobs/<unique name>`` under the
    current working directory and runs the configured
    ``hpc_submit_command`` on it.

    Args:
        model: Object whose class name is used in the job name and passed
            to the template as ``model``.
        c: Mapping with string keys. Its sorted ``key + str(value)`` pairs
            are encoded in the job file name, and its entries are merged
            into the template fields, taking precedence on a key clash.

    Raises:
        KeyError: If the config lacks ``job_partition``, ``job_conda_env``,
            ``mpi_module``, ``fort_compile_module`` or
            ``hpc_submit_command``.
    """
    cfg = get_config()

    # Directory containing this source file; the template sits beneath it.
    module_dir = os.path.dirname(inspect.stack()[0][1])
    job_path = os.path.join(module_dir, "jobscripts/slurm_pt_generator.job")
    python_path = os.path.abspath(module_dir)

    class_name = model.__class__.__name__
    encoded = []
    for key in sorted(c.keys()):
        encoded.append(key + str(c[key]))
    unique_name = class_name + "_" + "".join(encoded) + ".job"

    job_dir = os.path.abspath("jobs")
    log_target = os.path.join(job_dir, "zlog")

    substitutions = dict(
        partition=cfg["job_partition"],
        name=unique_name,
        conda_env=cfg["job_conda_env"],
        mpi_module=cfg["mpi_module"],
        fort_compile_module=cfg["fort_compile_module"],
        path=python_path,
        output=log_target,
        model=class_name,
    )

    with open(job_path) as src:
        raw_template = src.read()
    substitutions.update(c)
    script_text = raw_template.format(**substitutions)

    filename = os.path.join(job_dir, unique_name)
    os.makedirs(job_dir, exist_ok=True)
    with open(filename, "w") as dst:
        dst.write(script_text)

    logging.info(f"Submitting regen for {filename}")
    os.system(f"{cfg['hpc_submit_command']} {filename}")
Exemple #3
0
def write_jobscript_slurm(filename, name=None, num_tasks=24, num_concurrent=24, delete=False, hpc="getafix"):
    """Write a SLURM fit job script for ``filename`` and return its path.

    The template ``jobscripts/slurm_fit_{hpc}.job`` (relative to this source
    file) is filled in and written to ``<dir of filename>/job_files/<stem>.q``.
    ``<dir of filename>/out_files`` is created as well, optionally after
    being wiped.

    Args:
        filename: Path of the script the job should run.
        name: Job name. Defaults to the script's base name with its last
            three characters removed.
        num_tasks: Total number of fits. It is divided by the HPC's
            ``num_fits_per_job`` setting (default 1), rounded up, before
            being passed to the template.
        num_concurrent: Passed to the template unchanged.
        delete: If true, remove an existing ``out_files`` directory first.
        hpc: Key into the ``hpcs`` section of the config; also selects the
            template file.

    Returns:
        The path of the job script that was written.

    Raises:
        ValueError: If ``hpc`` is ``None``.
    """
    if hpc is None:
        raise ValueError("HPC environment variable is not set. Please set it to an hpc system, like export HPC=nersc")

    config = get_config()
    directory = os.path.dirname(os.path.abspath(filename))
    executable = os.path.basename(filename)
    if name is None:
        name = executable[:-3]
    output_dir = directory + os.sep + "out_files"
    q_dir = directory + os.sep + "job_files"

    # exist_ok=True already tolerates an existing directory, so no racy
    # os.path.exists() pre-check is needed.
    os.makedirs(q_dir, exist_ok=True)
    if delete and os.path.exists(output_dir):
        logging.debug("Deleting %s" % output_dir)
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Factor in jobs executing multiple fits
    hpc_config = config.get("hpcs", {}).get(hpc, {})
    fits_per_job = hpc_config.get("num_fits_per_job", 1)
    # The capped value is kept in a separate dict so the mapping owned by
    # the loaded config is never modified by this call.
    overrides = {}
    if fits_per_job >= num_tasks:
        overrides["num_fits_per_job"] = num_tasks
        num_tasks = 1
    else:
        num_tasks = int(np.ceil(num_tasks / fits_per_job))

    d = {
        "directory": directory,
        "executable": executable,
        "name": name,
        "output_dir": output_dir,
        "num_concurrent": num_concurrent,
        "num_tasks": num_tasks,
    }
    d.update(hpc_config)
    d.update(overrides)

    slurm_job = os.path.join(os.path.dirname(os.path.abspath(inspect.stack()[0][1])), f"jobscripts/slurm_fit_{hpc}.job")
    with open(slurm_job) as f:
        raw_template = f.read()
    template = raw_template.format(**d)

    # partition() yields the text before the first ".py", or the whole name
    # when ".py" is absent; str.index() raised ValueError in that case.
    stem = executable.partition(".py")[0]
    n = "%s/%s.q" % (q_dir, stem)
    with open(n, "w") as f:
        f.write(template)
    logging.info("SLURM Jobscript at %s" % n)
    return n
Exemple #4
0
    def fit(self, file):
        """Run a fit locally, submit a batch job, or run one indexed job.

        Three modes, chosen in this order:

        * ``self.is_local()`` is true: run ``self._run_fit(0, 0)`` only.
        * No command-line arguments: optionally delete ``self.temp_dir``
          (when ``self.remove_output`` is set), write a SLURM job script
          for ``file`` and submit it with the configured
          ``hpc_submit_command``.
        * Otherwise: treat ``sys.argv[1]`` as a job index. If it is a
          non-negative integer below ``self.get_num_jobs()``, run the
          matching model/walker pair; any other value does nothing.

        Args:
            file: Path of the script to submit as a batch job.
        """
        num_concurrent = self.get_num_concurrent()

        num_jobs = self.get_num_jobs()
        num_models = len(self.model_datasets)
        summary = (
            f"With {num_models} models+datasets and {self.num_walkers} walkers, "
            f"have {num_jobs} jobs"
        )
        self.logger.info(summary)

        if self.is_local():
            # Only do the first model+dataset on a local computer as a test
            self.logger.info("Running locally on the 0th index.")
            self._run_fit(0, 0)
            return

        if len(sys.argv) == 1:
            # Launching the job for the first time
            if os.path.exists(self.temp_dir) and self.remove_output:
                self.logger.info("Deleting %s" % self.temp_dir)
                shutil.rmtree(self.temp_dir)
            hpc = get_hpc()
            filename = write_jobscript_slurm(
                file,
                name=os.path.basename(file),
                num_tasks=self.get_num_jobs(),
                num_concurrent=num_concurrent,
                delete=False,
                hpc=hpc,
            )
            self.logger.info("Running batch job at %s" % filename)
            config = get_config()
            os.system(f"{config['hpc_submit_command']} {filename}")
            return

        # Running a specific fit to a model+dataset pair
        requested = sys.argv[1]
        index = int(requested) if requested.isdigit() else -1
        if index == -1 or index >= self.get_num_jobs():
            return

        mi, wi = self._get_indexes_from_index(index)
        self.logger.info(
            "Running model_dataset %d, walker number %d" % (mi, wi))
        self._run_fit(mi, wi)
Exemple #5
0
 def is_local(self):
     """Return True when the configured HPC-detecting command is not on PATH."""
     probe_command = get_config()["hpc_determining_command"]
     located = shutil.which(probe_command)
     return located is None