def submit_job(thecommand):
    """Submit ``thecommand`` as a Slurm job named "sync".

    The job requests 92 nodes and 4416 tasks on the ``skx-normal``
    partition with a 6-hour time limit. Whatever ``run`` returns is
    discarded.
    """
    resources = {
        "nodes": 92,
        "ntasks": 4416,
        "partition": 'skx-normal',
        "time": "06:00:00",
    }
    job = Slurm("sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` as a Slurm job named "sync".

    The job requests 81 nodes and 3888 tasks on the ``skx-normal``
    partition with a 20-hour time limit, charged to account
    ``TG-EAR140030``. Whatever ``run`` returns is discarded.
    """
    resources = {
        "nodes": 81,
        "ntasks": 3888,
        "partition": 'skx-normal',
        "time": "20:00:00",
        "account": "TG-EAR140030",
    }
    job = Slurm("sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` as a Slurm job named "sync".

    Every resource setting is read from names defined outside this
    function: ``N_node``, ``ntasks``, ``partition``, ``time`` and
    ``account``. Whatever ``run`` returns is discarded.
    """
    resources = {
        "nodes": N_node,
        "ntasks": ntasks,
        "partition": partition,
        "time": time,
        "account": account,
    }
    job = Slurm("sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` as a Slurm job named "process_sync".

    Node and task counts come from the outer names ``N_node`` and
    ``N_cores``; the job gets a 4-hour limit, one CPU per task and 2G of
    memory per CPU. Whatever ``run`` returns is discarded.
    """
    resources = {
        "nodes": N_node,
        "ntasks": N_cores,
        "time": "04:00:00",
        "cpus-per-task": 1,
        "mem-per-cpu": "2G",
    }
    job = Slurm("process_sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` as a Slurm job named "sync".

    The job requests 28 nodes and 1344 tasks on the ``skx-normal``
    partition with a 6-hour time limit, charged to account
    ``TG-EAR130011``. Whatever ``run`` returns is discarded.
    """
    resources = {
        "nodes": 28,
        "ntasks": 1344,
        "partition": 'skx-normal',
        "time": "06:00:00",
        "account": "TG-EAR130011",
    }
    job = Slurm("sync", resources)
    job.run(thecommand)
def run_parallel(ks, param_combinations, partition='low'):
    """Submit one Slurm job per parameter combination (needs slurmpy).

    ks : sequence of str
        Option names; each is passed to the script as ``--<name>``.
    param_combinations : sequence of sequences
        One entry per job; element ``j`` is the value for ``ks[j]``.
    partition : str
        Slurm partition the "fit_mog" jobs are submitted to.

    Each job loads the python module and runs ``sim_gaussian_mixture.py``
    (located under ``DIR_FILE``) with the generated flags.
    """
    from slurmpy import Slurm

    scheduler = Slurm("fit_mog", {"partition": partition, "time": "4-0"})
    for idx, combo in enumerate(param_combinations):
        base_cmd = f'module load python; python3 {opj(DIR_FILE, "sim_gaussian_mixture.py")} '
        # Every flag keeps its trailing space, so the command ends with one.
        flags = ''.join(
            '--' + key + ' ' + str(combo[pos]) + ' '
            for pos, key in enumerate(ks)
        )
        param_str = base_cmd + flags
        print(f'scheduled: {param_str}\n\t({idx}/{len(param_combinations)})')
        scheduler.run(param_str)
def submit_sbatch(template, conf):
    """Submit ``template`` with sbatch and block until the job leaves the queue.

    Args:
        template: command/template handed to ``Slurm.run``.
        conf: configuration mapping. Keys read here: ``"name"``,
            ``"SLURM_CONF_GLOBAL"``, ``"DRIVER"`` (including
            ``"program"``), ``"COMPUTE"["mstr_bench"]`` and, optionally,
            ``"benchmark"``. ``conf["DRIVER"]["mstr_bench"]`` is
            overwritten with the COMPUTE value.

    After submission the function polls ``squeue -j <job id>`` every five
    minutes. Once the job is gone it looks under ``./logs`` for a file
    whose name contains ``<job id>.err`` and derives the result from it
    with ``job_status``; without such a file the result stays
    ``'UNKNOWN'``. When ``"benchmark"`` is configured, start/end markers
    and the result are written through the ``write_bench_*`` helpers.
    """
    logging.warning(hist_fn)
    logging.info('Starting batch submission')

    if "benchmark" in conf:
        write_bench_start(conf["benchmark"])

    submit_func = "sbatch"
    rand_hash = ""  # gen_hash(template)
    # Placeholder text handed to configure() as-is; it is not expanded here.
    job_id = '${SLURM_JOB_ID}'
    # NOTE(review): the return value was never used; configure() is kept for
    # whatever it does to ``conf`` -- confirm against its definition.
    configure(conf, job_id, rand_hash)

    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    conf["DRIVER"]["mstr_bench"] = conf["COMPUTE"]["mstr_bench"]
    logging.info('Command to be executed: %s', conf["DRIVER"]["program"])

    job_id = str(s.run(template, cmd_kwargs=conf["DRIVER"], _cmd=submit_func))
    logging.info('Batch job ID: %s', job_id)

    # Give the scheduler a moment to register the job before the first poll.
    time.sleep(5)
    still_queued = True
    while still_queued:
        p = Popen(["squeue", "-j", job_id], stdout=PIPE, stderr=PIPE)
        out, _ = p.communicate()
        out = str(out, 'utf-8')
        logging.debug("Squeue output: %s", out)

        rows = out.split(os.linesep)
        rows.pop(0)  # first line is the squeue header
        # Skip blank lines. The previous filter compared a list with '' and
        # was therefore always true, letting empty ids into the queue list.
        queue = [row.strip().split(' ')[0] for row in rows if row.strip()]

        still_queued = job_id in queue
        if still_queued:
            logging.info('Job still running, sleeping for 5 mins')
            time.sleep(5 * 60)

    logging.info('Batch Job terminated')

    result = 'UNKNOWN'
    err_suffix = '{}.err'.format(job_id)
    logfile = [
        op.join(d, f)
        for d, _, lf in os.walk(op.abspath('logs'))
        for f in lf
        if err_suffix in f
    ]

    if logfile:
        logging.info('Driver logfile: %s', logfile[0])
        result = job_status(logfile[0])
    else:
        logging.warning('No logfile generated.')

    if "benchmark" in conf:
        write_bench_end(conf["benchmark"])
        write_bench_result(conf["benchmark"], result)
import itertools
from slurmpy import Slurm

partition = 'gpu_yugroup'

# sweep lambda_reg
params_to_vary = {
    '--reg1': [0, 1e-1, 5e-1, 1e0, 1e1, 1e2, 1e3],
    '--reg2': [0],
}

# run
s = Slurm("decode", {"partition": partition, "time": "3-0", "gres": "gpu:1"})

# Flags are handled in sorted order; each combination tuple follows that order.
ks = sorted(params_to_vary.keys())
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# iterate: one submitted job per combination
for combo in param_combinations:
    param_str = 'module load python; module load pytorch; python ../train.py '
    param_str += ''.join(
        key + ' ' + str(value) + ' ' for key, value in zip(ks, combo)
    )
    s.run(param_str)
from slurmpy import Slurm

# paths and constant values
nproc_old = 441  # number of processors used in bin files
# the mesh files
old_mesh_dir = "/scratch/05880/tg851791/asdf_sync/model_generating/tao_1d_ref_knl/DATABASES_MPI"
# the model files
old_model_dir = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/model/s362ani_good_min_tao_smooth"
model_tags = "vpv,vph,vsv,vsh,eta,qmu,rho"  # values to generate

# output directory
output_file = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/ppm/notopo/3d"

# region as lon1/lat1/lon2/lat2/dep1/dep2
# (eg: if lon1=30, lon2=20, get points like 30, 29, ...)
region = "74/0/175/62/0/800"
npts = "249/405/321"  # number of points, including the edge points.

# use 18*18 cores, can be set anyway you like.
# (two directions, divide subregions)
nproc = "18/18"

# check if ../../specfem_gll.jl/src/program/get_ppm_model.jl is actually
# the path of get_ppm_model.jl
command = (
    "date;"
    "ibrun julia ../../specfem_gll.jl/src/program/get_ppm_model.jl"
    f" --nproc_old {nproc_old}"
    f" --old_mesh_dir {old_mesh_dir}"
    f" --old_model_dir {old_model_dir}"
    f" --model_tags {model_tags}"
    f" --output_file {output_file}"
    f" --region {region}"
    f" --npts {npts}"
    f" --nproc {nproc};"
    "date;"
)

# run 2h18min for my region, 60d*60d, 336*336NEX 21*21 proc.
# It's safe to set a longer time.
s = Slurm(
    "ppm",
    {
        "partition": "normal",
        "nodes": 5,
        "ntasks": 324,
        "time": "04:00:00",
        "account": "TG-EAR130011",
    },
)
s.run(command)
def wrapper(*args, **kwargs):
    """Run the wrapped command directly, or hand it to SLURM in sbatch mode.

    When the grand-parent context's ``sbatch`` parameter is falsy the
    wrapped ``func`` is simply invoked with the given arguments and its
    result returned. Otherwise a batch script is assembled and passed to
    ``Slurm.run``: with ``sbatch`` when the grand-parent ``run`` parameter
    is set, with ``cat`` otherwise.

    Raises ``EmmetCliError`` when the parent command has no ``directory``
    parameter. Returns ``ReturnCodes.SUBMITTED`` when ``run`` is set and
    ``ReturnCodes.SUCCESS`` otherwise.
    """
    ctx = click.get_current_context()
    ctx.grand_parent = ctx.parent.parent
    top_params = ctx.grand_parent.params

    if not top_params["sbatch"]:
        return ctx.invoke(func, *args, **kwargs)

    run = top_params["run"]
    if run:
        click.secho("SBATCH MODE! Submitting to SLURM queue.", fg="green")

    directory = ctx.parent.params.get("directory")
    if not directory:
        raise EmmetCliError(
            f"{ctx.parent.command_path} needs --directory option!")

    # Scripts and logs of the submission are kept in <directory>/.emmet.
    track_dir = os.path.join(directory, ".emmet")
    if run and not os.path.exists(track_dir):
        os.mkdir(track_dir)
        logger.debug(f"{track_dir} created")

    bb = top_params["bb"]
    yes = top_params["yes"]

    if bb:
        if not yes:
            click.confirm("Did you run `module unload esslurm`?", abort=True)
        # Last path component of the directory names the staged copy.
        subdir = directory.rsplit(os.sep, 1)[1]
        stage_in = (
            f"#DW stage_in source={directory} "
            f"destination=$DW_JOB_STRIPED/{subdir} type=directory"
        )
        header_lines = [
            "#DW jobdw capacity=10TB access_mode=striped type=scratch",
            stage_in,
            "srun hostname",
            "",
        ]
        command = "\n".join(header_lines)
        slurm_kwargs = {
            "qos": "premium",
            "nodes": 1,
            "tasks-per-node": 1,
            "constraint": "haswell",
            "time": "48:00:00",
        }
    else:
        if not yes:
            click.confirm("Did you run `module load esslurm`?", abort=True)
        slurm_kwargs = {
            "qos": "xfer",
            "time": "48:00:00",
            "licenses": "SCRATCH",
            "mem": "30GB",
        }
        command = ""

    job_name = ctx.command_path.replace(" ", "-")
    s = Slurm(
        job_name,
        slurm_kwargs=slurm_kwargs,
        date_in_name=False,
        scripts_dir=track_dir,
        log_dir=track_dir,
        bash_strict=False,
    )

    command += reconstruct_command(sbatch=True)

    # Whatever the submission writes to stderr is captured and logged.
    slurmpy_stderr = io.StringIO()
    submit_with = "sbatch" if run else "cat"
    with contextlib.redirect_stderr(slurmpy_stderr):
        s.run(command, _cmd=submit_with, tries=1)  # 6 days

    # Drop the first two and the last character of the captured text
    # (presumably a bytes-repr wrapper -- TODO confirm), then unescape it.
    ret = slurmpy_stderr.getvalue()[2:-1]
    logger.info("\n" + ret.encode("utf-8").decode("unicode_escape"))

    # TODO add jobid to SUBMITTED.value
    if run:
        return ReturnCodes.SUBMITTED
    return ReturnCodes.SUCCESS
models = [ 'random_forest', 'gradient_boosting', 'skope_rules', 'rulefit', 'fplasso', 'fpskope', 'grl', 'oner', 'brs' ] extra_args = '' if args.ignore_cache: extra_args += ' --ignore_cache' if args.low_data: extra_args += ' --low_data' if not args.test_only: for model in models: s.run(f'python experiments/compare_models.py --model {model} --cv' + extra_args) brl_job_ids = [] for i in range(26): job_id = s.run( f'python experiments/compare_models.py --model brl --parallel_id {i} --cv' + extra_args) brl_job_ids.append(job_id) # s.run(f'python experiments/combine.py --model brl', depends_on=brl_job_ids) if not args.val_only: models += [ 'stbl_l2_mm0', 'stbl_l2_mm1', 'stbl_l2_mm2', 'stbl_l1_mm0', 'stbl_l1_mm1', 'stbl_l1_mm2' ]
from slurmpy import Slurm

# * background+tao/crust1.0 -> background+tao/crust1.0 -> background+tao/crust1.0+min/crust2.0
print("start job 4")

# Values passed as --nproc_old / --old_mesh_dir / --old_model_dir.
nproc_old = 336
old_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/tao_h_files/DATABASES_MPI"
old_model_dir = "/scratch/05880/tg851791/binfile/perturbation/tao_perturbation_smooth"

# Values passed as --nproc_new / --new_mesh_dir / --new_model_dir.
nproc_new = 441
new_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/simulation_taoreg/DATABASES_MPI"
new_model_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin-pert"

model_tags = ",".join(("vph", "vpv", "vsh", "vsv", "eta", "qmu", "rho"))
output_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin_addtao-pert"

command4 = (
    "ibrun julia src/program/xsem_interp_mesh2.jl"
    f" --nproc_old {nproc_old}"
    f" --old_mesh_dir {old_mesh_dir}"
    f" --old_model_dir {old_model_dir}"
    f" --nproc_new {nproc_new}"
    f" --new_mesh_dir {new_mesh_dir}"
    f" --new_model_dir {new_model_dir}"
    f" --model_tags {model_tags}"
    f" --output_dir {output_dir}"
)

s4 = Slurm(
    "bg+tao",
    {
        "partition": "skx-normal",
        "nodes": 10,
        "ntasks": 441,
        "time": "00:60:00",
    },
)
jobid_s4 = s4.run(command4)
def schedule(config: dict, name_addition: str = None):
    """
    Schedules a given configuration as a new job

    The job script is assembled from ``config['script']``: every entry of
    ``config['script']['parameters']`` becomes a shell variable, all of
    them are collected into an ``auto_args`` variable as
    ``--name=${name}`` options, and the optional ``times`` entry wraps the
    body in a ``for`` loop. Top-level ``before_script`` / ``after_script``
    entries are placed around the result. The id returned by the scheduler
    is stored in ``config['jobid']``.

    Args:
        config (dict): job configuration
        name_addition (str, optional): Defaults to None. Addition to the
            job name; also exported to the script as ``jobId`` (empty when
            None).

    Raises:
        RuntimeError: When requested scheduler is not available
    """
    scheduler_cfg = config['scheduler']
    script_cfg = config['script']

    executer = scheduler_cfg['type'].lower()
    if executer not in ('slurm', 'bash'):
        logger.error("Only SLURM or bash are supported at the moment!")
        raise RuntimeError("Unsupported Job Manager!")

    # If a host entry matches replace the found parameters
    if 'host' in scheduler_cfg:
        hostname = socket.getfqdn()
        logger.debug("Hostname: " + hostname)
        if hostname in scheduler_cfg['host']:
            logger.debug("Found host entry for this hostname")
            # 'parameters' may be missing or empty; start from an empty
            # mapping then instead of failing on the first override.
            merged = scheduler_cfg.get('parameters') or {}
            merged.update(scheduler_cfg['host'][hostname]['parameters'])
            scheduler_cfg['parameters'] = merged

    # Create Slurm job script, allow empty parameters
    parameters = dict(scheduler_cfg.get('parameters') or {})

    # Check if a log directory is set
    log_directory = script_cfg.get('log-directory')

    # Check for job name
    job_name = scheduler_cfg.get('job-name', "ace")

    job = Slurm(job_name, parameters, log_directory=log_directory)

    body = script_cfg['body']
    auto_args = []

    # Add env var with job id. name_addition defaults to None, which used to
    # raise TypeError on concatenation; export an empty value instead so the
    # variable is still defined inside the script.
    job_id_value = "" if name_addition is None else name_addition
    env_vars = [f"jobId={job_id_value}"]

    for k, v in script_cfg['parameters'].items():
        # Check if variable already set
        if k in os.environ:
            logger.warning(k + " environment variable already set!")

        # Set env variable; dict entries carry their value under 'values'.
        # Formatting (rather than '+') also accepts non-string values.
        value = v['values'] if isinstance(v, dict) else v
        env_vars.append(f'{k}="{value}"')
        auto_args.append("--" + k + "=${" + k + "}")

    # Create auto_args
    if 'auto_args' in os.environ:
        logger.warning("auto_args environment variable already set!")
    env_vars.append("")  # Add a new line between args and auto_args
    env_vars.append("auto_args=\"" + " ".join(auto_args) + "\"")

    # Handle times keyword
    prefix = ''
    suffix = ''
    if 'times' in script_cfg:
        # str() so that a numeric repeat count works as well
        prefix = "for run in {1.." + str(script_cfg['times']) + "}\ndo\n\n\n"
        suffix = "done"

    # Handle before_script / after_script
    before_script = config.get('before_script', '')
    after_script = config.get('after_script', '')

    # Join body
    body = before_script + "\n\n" + prefix + \
        "\n".join(env_vars) + "\n\n\n" + body + \
        "\n\n\n" + suffix + "\n\n" + after_script

    # Schedule job script
    if executer == 'bash':
        config['jobid'] = job.run(body, _cmd='bash',
                                  name_addition=name_addition)
    else:
        config['jobid'] = job.run(body, name_addition=name_addition)
'work', work_dir + cdn_ident) cmd += ["--cli", "--work_dir", work_dir] s = Slurm("incrementation", slurm_conf) if cdn["framework"] == "spark": cmd = " ".join(cmd) cmd = "\"{}\"".format(cmd) print("Submitting command: ", cmd) s.run("bash " + spark_template, cmd_kwargs={"spscript": cmd, "parallelism": slurm_conf["cpus-per-task"]}, _cmd=sys.argv[2]) else: with open(legends[cdn["dataset"]]) as legend: images = legend.read().split() num_images = len(images) pn_images = num_images/num_nodes pn_remain = num_images % num_nodes count_rem = 0 idx = 0 for i in range(0, num_images - pn_remain, pn_images): files = None if count_rem < pn_remain:
from slurmpy import Slurm

# ! change setting first!!!
# per_s362ani_good + per_tao -> per_s362ani_good_tao

# Values passed as --nproc_old / --old_mesh_dir / --old_model_dir.
nproc_old = 336
old_mesh_dir = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/control_file/tao"
old_model_dir = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/perturbation/per_tao"

# Values passed as --nproc_new / --new_mesh_dir / --new_model_dir.
nproc_new = 324
new_mesh_dir = "/scratch/05880/tg851791/work/generate_small_v703/specfem/s362ani_good/DATABASES_MPI"
new_model_dir = "/scratch/05880/tg851791/work/generate_small_v703/perturbation/per_s362ani_good"

model_tags = ",".join(("vph", "vpv", "vsh", "vsv", "eta", "qmu", "rho"))
output_dir = "/scratch/05880/tg851791/work/generate_small_v703/perturbation/per_s362ani_good_tao"

command1 = (
    "ibrun julia ../specfem_gll.jl/src/program/xsem_interp_mesh2.jl"
    f" --nproc_old {nproc_old}"
    f" --old_mesh_dir {old_mesh_dir}"
    f" --old_model_dir {old_model_dir}"
    f" --nproc_new {nproc_new}"
    f" --new_mesh_dir {new_mesh_dir}"
    f" --new_model_dir {new_model_dir}"
    f" --model_tags {model_tags}"
    f" --output_dir {output_dir}"
)

s = Slurm(
    "interp",
    {
        "partition": "skx-normal",
        "nodes": 10,
        "ntasks": 324,
        "time": "00:60:00",
        "account": "TG-EAR140030",
    },
)
# `date` before and after the run records start and end times in the job log.
s.run("date; " + command1 + "; date;")
samples = read_samplesheet(args.sample, args.project) if args.mkref: jobids = [] for sample in samples: fastq = samples[sample] run_bwa = align_and_convert(config, fastq, args, sample) bwa = Slurm("bwaAln-{}".format(sample), { "account": config["slurm"]["account"], "partition": "node", "time": config["slurm"]["time"] }, log_dir="{}/logs".format(args.out), scripts_dir="{}/scripts".format(args.out)) jobids.append(bwa.run(run_bwa)) wcxmkref = Slurm("wcxmkref", { "account": config["slurm"]["account"], "partition": "node", "time": config["slurm"]["time"] }, log_dir="{}/logs".format(args.out), scripts_dir="{}/scripts".format(args.out)) wcxmkref.run(mkref(config, args), depends_on=jobids) elif args.mkmodel: f = open("{}.PREFACE.config.tab".format(args.out.rstrip("/")), "w") f.write("ID\tfilepath\tgender\tFF\n") for sample in samples: for line in open("{}/{}/{}.AMYCNE.tab".format(args.out, sample,
def main():
    """Command-line entry point: submit worker jobs and launch the driver.

    Command-line arguments:
        template         SLURM batch script template.
        params           JSON file with the configuration. Keys read here:
                         ``name``, ``SLURM_CONF_GLOBAL``, ``logdir``,
                         ``num_nodes``, ``DRIVER`` and (optionally)
                         ``COMPUTE``.
        -y / --yarn      Empty Hadoop's ``etc/hadoop/slaves`` file first.
        -D / --no_submit Run the template with ``bash`` instead of
                         submitting it with ``sbatch``.

    One job per node is started from the template (one fewer with
    ``--no_submit``). The function then waits for the master log file to
    appear, reads the master URL from its first line and starts
    ``spark-submit`` for ``conf["DRIVER"]["program"]``.
    """
    parser = argparse.ArgumentParser(description='Pilot-Agent scheduling for SLURM')
    parser.add_argument('template', type=str,
                        help="SLURM batch script template")
    parser.add_argument('params', type=argparse.FileType('r'),
                        help="SLURM batch script params (JSON)")
    parser.add_argument('-y', '--yarn', action='store_true',
                        help="Yarn scheduler will be used")
    parser.add_argument('-D', '--no_submit', action='store_true',
                        help="Create but do not submit sbatch scripts")
    args = parser.parse_args()

    with args.params as f:
        conf = json.load(f)

    if args.yarn:
        # Truncate the Hadoop slaves file.
        # NOTE(review): the environment branch tests for 'COMPUTE' but reads
        # 'HADOOP_HOME' -- kept as in the original, please confirm.
        if 'COMPUTE' in os.environ:
            hadoop_home = os.environ['HADOOP_HOME']
        else:
            hadoop_home = conf["COMPUTE"]["HADOOP_HOME"]
        open(op.join(hadoop_home, 'etc/hadoop/slaves'), 'w').close()

    submit_func = "bash" if args.no_submit else "sbatch"

    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])

    program_start = datetime.now().strftime("%Y-%m-%d")
    rand_hash = '{0}-{1}'.format(
        hashlib.sha1(args.template.encode("utf-8")).hexdigest(),
        hashlib.md5(os.urandom(16)).hexdigest())
    # With sbatch the job id is left as a shell variable in the file name.
    job_id = rand_hash if args.no_submit else '${SLURM_JOB_ID}'

    if "COMPUTE" not in conf:
        conf["COMPUTE"] = {}
    compute = conf["COMPUTE"]
    logdir = conf["logdir"]

    if "mstr_bench" not in compute:
        compute["mstr_bench"] = op.join(
            logdir,
            "master-{0}-benchmarks.{1}.out".format(program_start, job_id))
    if "mstr_log" not in compute:
        compute["mstr_log"] = op.join(
            logdir, "master-{0}-{1}.out".format(program_start, rand_hash))

    compute["mstr_lock"] = op.join(
        logdir, "master-{0}-{1}.lock".format(program_start, rand_hash))
    compute["logdir"] = logdir

    # if you want to run one master and worker locally, might as well submit
    # to local
    rm_nnodes = 1 if args.no_submit else 0
    n_workers = conf["num_nodes"] - rm_nnodes

    for _ in range(n_workers):
        # SLURM batch submit workers
        if args.no_submit:
            thread = threading.Thread(
                target=s.run,
                kwargs=dict(command=args.template,
                            cmd_kwargs=compute,
                            _cmd=submit_func))
            thread.daemon = True
            thread.start()
        else:
            s.run(args.template, name_addition=rand_hash,
                  cmd_kwargs=compute, _cmd=submit_func)

    # Wait until the master log file exists.
    while n_workers > 0 and not op.isfile(compute["mstr_log"]):
        time.sleep(5)

    if n_workers > 0:
        master_url = ""
        with open(compute["mstr_log"], 'r') as f:
            master_url = f.readline().strip('\n')

    program = None
    driver_out = op.join(
        logdir, "driver-{0}-{1}.out".format(program_start, rand_hash))

    if not args.no_submit:
        # Context managers close both handles even if starting the
        # allocation or one of the writes fails (they used to leak then).
        with open(driver_out, "wb") as fw, open(driver_out, "r") as fr:
            p = Popen(conf["DRIVER"]["slurm_alloc"], stdin=PIPE,
                      stdout=fw, stderr=fw, bufsize=1)

            # The commands below are typed into the allocation's stdin.
            # NOTE(review): stdin is never flushed or closed here -- confirm
            # the commands actually reach the process.
            for module in conf["DRIVER"]["modules"]:
                p.stdin.write("module load {}\n".format(module).encode('utf-8'))

            p.stdin.write("echo start $(date +%s.%N)\n".encode('utf-8'))

            program = ("spark-submit --master {0} --executor-cores=${{SLURM_CPUS_PER_TASK}} "
                       "--executor-memory=${{SLURM_MEM_PER_NODE}}M --driver-memory=${{SLURM_MEM_PER_NODE}}M {1}\n") \
                .format(master_url, conf["DRIVER"]["program"])
            p.stdin.write(program.encode('utf-8'))

            fr.read()  # content is not used; call kept from the original

            p.stdin.write("echo end $(date +%s.%N)\n".encode('utf-8'))
            p.stdin.write("echo 'SUCCEEDED' >> {}".format(compute["mstr_log"]).encode('utf-8'))
    elif conf["num_nodes"] == 1:
        program = ("spark-submit --master local[*] {}\n").format(conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout=PIPE, stderr=PIPE)
        stdout_data, stderr_data = p.communicate()
        print(stdout_data, stderr_data)
    else:
        program = ("spark-submit --master {0} {1}\n").format(master_url, conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout=PIPE, stderr=PIPE)
        stdout_data, stderr_data = p.communicate()
        print(stdout_data, stderr_data)
def cluster(tool, invocation, clowdrloc, dataloc, cluster, **kwargs):
    """cluster
    Submits every consolidated Clowdr task to the scheduler as a
    ``clowdr run ... --local`` job.

    Parameters
    ----------
    tool : str
        Path to a boutiques descriptor for the tool to be run
    invocation : str
        Path to a boutiques invocation for the tool and parameters to be run
    clowdrloc : str
        Path for storing Clowdr intermediate files and outputs
    dataloc : str
        Path for accessing input data. If local, provide the hostname and
        optionally a path. If on S3, provide an S3 path.
    cluster : str
        Scheduler on the cluster being used. Not read by this function;
        jobs are always submitted through slurmpy's ``Slurm``.
    **kwargs : dict
        Arbitrary keyword arguments. Those read here:
        - simg : str
            Path that is resolved with ``utils.truepath`` before use
        - jobname : str
            Base-name for the jobs (default "clowdrtask")
        - slurm_args : str
            Comma-separated ``key:value`` scheduler options
        - workdir : str
            Appended to the job command as ``-w``
        - volumes : list
            Each entry is appended to the job command as ``-v``
        - verbose : bool
            Toggle verbose output printing
        - dev : bool
            Toggle dev mode (only submits the first task)
        All keyword arguments are also forwarded to
        "controller.metadata.consolidateTask" and "utils.getContainer".

    Returns
    -------
    str
        The directory holding the task files; jobs are submitted from it.
    """
    # TODO: scrub inputs
    tool = utils.truepath(tool)
    if kwargs.get("simg"):
        kwargs["simg"] = utils.truepath(kwargs["simg"])

    from slurmpy import Slurm

    verbose = kwargs.get("verbose")

    if verbose:
        print("Consolidating metadata...")
    tasks, _invocs = metadata.consolidateTask(tool, invocation, clowdrloc,
                                              dataloc, **kwargs)
    if kwargs.get("dev"):
        tasks = [tasks[0]]  # Just launch the first task in dev

    # Work from the directory that holds the task files.
    taskdir = op.dirname(utils.truepath(tasks[0]))
    try:
        os.mkdir(taskdir)
    except FileExistsError:
        pass
    os.chdir(taskdir)

    with open(tool) as fhandle:
        container = json.load(fhandle).get("container-image")
    if container:
        if verbose:
            print("Getting container...")
        outp = utils.getContainer(taskdir, container, **kwargs)
        if verbose:
            print(outp)

    jobname = kwargs.get("jobname") or "clowdrtask"

    # "key:value,key:value" -> dict; only the first ':' separates key and
    # value, so values may themselves contain ':'.
    slurm_args = {}
    raw_slurm_args = kwargs.get("slurm_args")
    if raw_slurm_args:
        for opt in raw_slurm_args.split(","):
            key, _, value = opt.partition(":")
            slurm_args[key] = value
    job = Slurm(jobname, slurm_args)

    # Command template; filled per task with (task, taskdir) below.
    script = "clowdr run {} -c {} --local"
    if kwargs.get("workdir"):
        script += " -w {}".format(kwargs["workdir"])
    volumes = kwargs.get("volumes")
    if volumes:
        script += " ".join(" -v {}".format(vol) for vol in volumes)

    for task in tasks:
        job.run(script.format(task, taskdir))

    if verbose:
        print(taskdir)
    return taskdir
if cdn["filesystem"] != "mem": work_dir = op.join(filesystems[cdn["filesystem"]], 'work', work_dir + cdn_ident) cmd += ["--cli", "--work_dir", work_dir] s = Slurm("incrementation", slurm_conf) if cdn["framework"] == "spark": cmd = " ".join(cmd) cmd = "\"{}\"".format(cmd) print("Submitting command: ", cmd) s.run("bash " + spark_template, cmd_kwargs={"spscript": cmd}, _cmd=sys.argv[2]) else: with open(legends[cdn["dataset"]]) as legend: images = legend.read().split() num_images = len(images) pn_images = num_images / num_nodes pn_remain = num_images % num_nodes count_rem = 0 idx = 0 for i in range(0, num_images - pn_remain, pn_images): files = None if count_rem < pn_remain: