def submit_job(thecommand):
    """Submit ``thecommand`` to Slurm as a job named "sync".

    The resource request is fixed inside the function: 92 nodes and 4416
    tasks on the ``skx-normal`` partition with a ``06:00:00`` time limit.

    Args:
        thecommand: Command text handed to ``Slurm.run``.
    """
    resources = dict(
        nodes=92,
        ntasks=4416,
        partition='skx-normal',
        time="06:00:00",
    )
    Slurm("sync", resources).run(thecommand)
def submit_sbatch(template, conf):
    """Submit the driver batch script with ``sbatch`` and wait for it to finish.

    The job is polled through ``squeue -j <job_id>`` every five minutes until
    it no longer appears in the queue listing. Afterwards the ``logs``
    directory is searched for a file whose name contains ``<job_id>.err`` and
    the job result is derived from it with ``job_status``.

    Args:
        template: Batch script template passed to ``Slurm.run``.
        conf: Configuration mapping. Reads ``name``, ``SLURM_CONF_GLOBAL``,
            ``DRIVER``, ``COMPUTE`` and, optionally, ``benchmark``.
            ``conf["DRIVER"]["mstr_bench"]`` is overwritten in place.
    """
    logging.warning(hist_fn)
    logging.info('Starting batch submission')

    if "benchmark" in conf:
        write_bench_start(conf["benchmark"])

    submit_func = "sbatch"
    rand_hash = ""  # gen_hash(template)

    # The literal placeholder is expanded by the shell inside the batch job.
    configure(conf, '${SLURM_JOB_ID}', rand_hash)

    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    conf["DRIVER"]["mstr_bench"] = conf["COMPUTE"]["mstr_bench"]
    logging.info('Command to be executed: %s', conf["DRIVER"]["program"])

    job_id = str(s.run(template, cmd_kwargs=conf["DRIVER"], _cmd=submit_func))
    logging.info('Batch job ID: %s', job_id)

    time.sleep(5)
    while True:
        p = Popen(["squeue", "-j", job_id], stdout=PIPE, stderr=PIPE)
        out, _ = p.communicate()
        out = out.decode('utf-8')
        logging.debug("Squeue output: %s", out)

        # Skip the header row; the first whitespace-separated column of each
        # remaining non-blank row is the job id. (The previous filter compared
        # a list with '' and therefore never rejected blank rows.)
        rows = (line.split() for line in out.splitlines()[1:])
        queue = {fields[0] for fields in rows if fields}

        if job_id not in queue:
            break
        logging.info('Job still running, sleeping for 5 mins')
        time.sleep(5 * 60)

    logging.info('Batch Job terminated')

    result = 'UNKNOWN'
    err_name = '{}.err'.format(job_id)
    logfile = [
        op.join(dirpath, fname)
        for dirpath, _, fnames in os.walk(op.abspath('logs'))
        for fname in fnames
        if err_name in fname
    ]

    if logfile:
        logging.info('Driver logfile: %s', logfile[0])
        result = job_status(logfile[0])
    else:
        logging.warning('No logfile generated.')

    if "benchmark" in conf:
        write_bench_end(conf["benchmark"])
        write_bench_result(conf["benchmark"], result)
def submit_job(thecommand):
    """Submit ``thecommand`` to Slurm as a job named "process_sync".

    Node and task counts come from the surrounding names ``N_node`` and
    ``N_cores``; the remaining settings (four-hour limit, one CPU per task,
    ``2G`` per CPU) are fixed here.

    Args:
        thecommand: Command text handed to ``Slurm.run``.
    """
    resources = {
        "nodes": N_node,
        "ntasks": N_cores,
        "time": "04:00:00",
        "cpus-per-task": 1,
        "mem-per-cpu": "2G",
    }
    job = Slurm("process_sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` to Slurm as a job named "sync".

    The request is fixed: 28 nodes and 1344 tasks on ``skx-normal`` for
    ``06:00:00``, charged to account ``TG-EAR130011``.

    Args:
        thecommand: Command text handed to ``Slurm.run``.
    """
    resources = dict(
        nodes=28,
        ntasks=1344,
        partition='skx-normal',
        time="06:00:00",
        account="TG-EAR130011",
    )
    Slurm("sync", resources).run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` to Slurm as a job named "sync".

    All resource settings are taken from names defined outside this
    function: ``N_node``, ``ntasks``, ``partition``, ``time`` and
    ``account``.

    Args:
        thecommand: Command text handed to ``Slurm.run``.
    """
    resources = dict(
        nodes=N_node,
        ntasks=ntasks,
        partition=partition,
        time=time,
        account=account,
    )
    job = Slurm("sync", resources)
    job.run(thecommand)
def submit_job(thecommand):
    """Submit ``thecommand`` to Slurm as a job named "sync".

    The request is fixed: 81 nodes and 3888 tasks on ``skx-normal`` for
    ``20:00:00``, charged to account ``TG-EAR140030``.

    Args:
        thecommand: Command text handed to ``Slurm.run``.
    """
    resources = dict(
        nodes=81,
        ntasks=3888,
        partition='skx-normal',
        time="20:00:00",
        account="TG-EAR140030",
    )
    Slurm("sync", resources).run(thecommand)
def run_parallel(ks, param_combinations, partition='low'):
    '''Submit one Slurm job per parameter combination (requires slurmpy).

    Each job runs ``sim_gaussian_mixture.py`` (located under ``DIR_FILE``)
    with one ``--<key> <value>`` pair per entry of ``ks``.

    Args:
        ks: Parameter names, in the same order as the values inside each
            combination.
        param_combinations: Sequence of value tuples, one per job.
        partition: Slurm partition to submit to.
    '''
    from slurmpy import Slurm

    s = Slurm("fit_mog", {"partition": partition, "time": "4-0"})
    total = len(param_combinations)
    for i in range(total):
        combo = param_combinations[i]
        pieces = [
            f'module load python; python3 {opj(DIR_FILE, "sim_gaussian_mixture.py")} '
        ]
        for j, key in enumerate(ks):
            pieces.append('--' + key + ' ' + str(combo[j]) + ' ')
        param_str = ''.join(pieces)
        print(f'scheduled: {param_str}\n\t({i}/{total})')
        s.run(param_str)
def create_job(self, name: str, afterok: list = None, afternotok: list = None) -> Slurm:
    """Create a job for submitting to SLURM.

    Args:
        name: Name of the job.
        afterok: Job ids that must finish successfully before this job
            starts; rendered as ``afterok:<id>:<id>...``.
        afternotok: Job ids that must fail before this job starts; rendered
            as ``afternotok:<id>:<id>...``.

    Returns:
        A ``Slurm`` object configured with the instance settings plus the
        requested dependencies (comma-joined when both kinds are given).
    """
    LOG.info("Create a slurm job with name %s", name)

    # Build the per-job settings on a copy so that the dependencies of one
    # job never leak into ``self.slurm_settings`` (and thus into later jobs).
    # Any pre-existing "dependency" entry is dropped, as before.
    settings = {
        key: value
        for key, value in self.slurm_settings.items()
        if key != "dependency"
    }

    dependencies = []
    if afterok:
        dependencies.append(
            "afterok:{}".format(":".join(str(dependency) for dependency in afterok)))
    if afternotok:
        dependencies.append(
            "afternotok:{}".format(":".join(str(dependency) for dependency in afternotok)))
    if dependencies:
        settings["dependency"] = ",".join(dependencies)

    return Slurm(
        name,
        settings,
        log_dir=str(self.log_dir),
        scripts_dir=str(self.scripts_dir),
    )
def my_Slurm(*args, cfg_update=None, **kwargs):
    """Shortcut to slurmpy's class; keep certain default kwargs
    and only update some with kwarg `cfg_update`
    see https://github.com/brentp/slurmpy

    Args:
        *args: Positional arguments forwarded to ``Slurm``.
        cfg_update: Optional mapping whose entries override
            ``cluster.slurm_cfg`` for this job only. ``None`` (the default)
            means "no overrides".
        **kwargs: Further keyword arguments forwarded to ``Slurm``.

    Returns:
        The constructed ``Slurm`` object.
    """
    # A ``None`` default avoids sharing one mutable dict between all calls.
    overrides = cfg_update if cfg_update is not None else {}
    return Slurm(*args,
                 slurm_kwargs=dict(cluster.slurm_cfg, **overrides),
                 log_dir=log_dir,
                 scripts_dir=slurm_scripts_dir,
                 **kwargs)
def create_job(self, name: str) -> Slurm:
    """Create a job for submitting to SLURM.

    The job uses the instance's ``account`` and ``time`` settings and writes
    its scripts and logs to ``scripts_dir`` and ``log_dir`` respectively.

    Args:
        name: Name of the job.

    Returns:
        The configured ``Slurm`` object.
    """
    LOG.info("Create a slurm job with name %s", name)
    settings = dict(account=self.account, time=self.time)
    return Slurm(
        name,
        settings,
        scripts_dir=str(self.scripts_dir),
        log_dir=str(self.log_dir),
    )
def submit_locally(template, conf):
    """Start the workers through ``bash`` and run the driver with spark-submit.

    Args:
        template: Script template; used to derive the run hash and passed to
            ``start_workers``.
        conf: Configuration mapping. Reads ``name``, ``SLURM_CONF_GLOBAL``,
            ``num_nodes``, ``COMPUTE`` and ``DRIVER`` (``program`` and the
            optional ``jars``).
    """
    rand_hash = gen_hash(template)
    configure(conf, "", rand_hash)

    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    master_url = start_workers(s, conf["num_nodes"], conf["COMPUTE"],
                               template, rand_hash, "bash")

    driver = conf["DRIVER"]
    program = ["spark-submit", "--master", master_url]
    if "jars" in driver:
        program += ["--jars", driver["jars"]]
    program += [driver["program"]]

    # Run the driver to completion and echo whatever it produced.
    proc = Popen(program, stdout=PIPE, stderr=PIPE)
    captured_out, captured_err = proc.communicate()
    print(captured_out, captured_err)
from slurmpy import Slurm
import argparse

# Submits one cross-validation job per model to the chosen partition.
partition = 'high'
s = Slurm("compare_models", {"partition": partition})

parser = argparse.ArgumentParser()
for flag in ('--val_only', '--test_only', '--ignore_cache', '--low_data'):
    parser.add_argument(flag, action='store_true')
args = parser.parse_args()

models = [
    'random_forest', 'gradient_boosting', 'skope_rules', 'rulefit',
    'fplasso', 'fpskope', 'grl', 'oner', 'brs',
]

# Flags forwarded unchanged to every submitted command.
extra_args = ''.join(
    ' --' + name
    for name in ('ignore_cache', 'low_data')
    if getattr(args, name)
)

if not args.test_only:
    for model in models:
        command = f'python experiments/compare_models.py --model {model} --cv'
        s.run(command + extra_args)
import itertools
from slurmpy import Slurm
import numpy as np

# Submits one fmri job per parameter combination.
partition = 'high'
kernel_version = True
params_to_vary = {
    'run': list(range(100)),  # should be range(100)
}

# run
s = Slurm("fmri", {"partition": partition, "time": "2-0"})
ks = sorted(params_to_vary.keys())
param_combinations = list(
    itertools.product(*(params_to_vary[k] for k in ks)))  # list of tuples
print(len(param_combinations))
ks = np.array(ks)

# Pick the entry script once; it does not change between jobs.
if kernel_version:
    base_cmd = 'module load python; python3 ../fmri/run_kernel.py '
else:
    base_cmd = 'module load python; python3 ../fmri/run.py '

# iterate
for combo in param_combinations:
    param_str = base_cmd
    for key, value in zip(ks, combo):
        param_str += key + ' ' + str(value) + ' '
    print(param_str)
    s.run(param_str)
from slurmpy import Slurm

# * background+tao/crust1.0 -> background+tao/crust1.0 -> background+tao/crust1.0+min/crust2.0
print("start job 4")

# Source mesh/model.
nproc_old = 336
old_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/tao_h_files/DATABASES_MPI"
old_model_dir = "/scratch/05880/tg851791/binfile/perturbation/tao_perturbation_smooth"

# Target mesh/model.
nproc_new = 441
new_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/simulation_taoreg/DATABASES_MPI"
new_model_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin-pert"

model_tags = "vph,vpv,vsh,vsv,eta,qmu,rho"
output_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin_addtao-pert"

# Assemble the interpolation command as "--flag value" pairs.
command4 = " ".join([
    "ibrun julia src/program/xsem_interp_mesh2.jl",
    "--nproc_old", str(nproc_old),
    "--old_mesh_dir", old_mesh_dir,
    "--old_model_dir", old_model_dir,
    "--nproc_new", str(nproc_new),
    "--new_mesh_dir", new_mesh_dir,
    "--new_model_dir", new_model_dir,
    "--model_tags", model_tags,
    "--output_dir", output_dir,
])

s4 = Slurm(
    "bg+tao",
    dict(partition="skx-normal", nodes=10, ntasks=441, time="00:60:00"),
)
jobid_s4 = s4.run(command4)
def schedule(config: dict, name_addition: str = None):
    """
    Schedules a given configuration as a new job

    The job script is assembled as: ``before_script``, an optional
    ``for run in {1..times}`` loop header, the exported variables
    (``jobId``, one per script parameter, ``auto_args``), the script body,
    the loop footer and ``after_script``. The resulting job id is stored in
    ``config['jobid']``.

    Args:
        config (dict): job configuration
        name_addition (str, optional): Defaults to None. Addition to the job
            name; also exported as ``jobId`` (empty when None).

    Raises:
        RuntimeError: When requested scheduler is not available
    """
    scheduler = config['scheduler']
    script = config['script']

    executer = scheduler['type'].lower()
    if executer not in ['slurm', 'bash']:
        logger.error("Only SLURM or bash are supported at the moment!")
        raise RuntimeError("Unsupported Job Manager!")

    # If a host entry matches replace the found parameters
    if 'host' in scheduler:
        hostname = socket.getfqdn()
        logger.debug("Hostname: " + hostname)
        if hostname in scheduler['host']:
            logger.debug("Found host entry for this hostname")
            overrides = scheduler['host'][hostname]['parameters']
            # Create the parameter section on demand so a host override does
            # not fail when the base configuration has no parameters.
            if scheduler.get('parameters') is None:
                scheduler['parameters'] = {}
            scheduler['parameters'].update(overrides)

    # Create Slurm job script, allow empty parameters
    parameters = dict(scheduler.get('parameters') or {})

    # Check if a log directory is set
    log_directory = script.get('log-directory')

    # Check for job name
    job_name = scheduler.get('job-name', "ace")

    job = Slurm(job_name, parameters, log_directory=log_directory)

    body = script['body']
    auto_args = []

    # Add env var with job id; ``name_addition`` defaults to None, which
    # cannot be concatenated to a string, so fall back to an empty value.
    env_vars = ["jobId=" + ("" if name_addition is None else str(name_addition))]

    for k, v in script['parameters'].items():
        # Check if variable already set
        if k in os.environ:
            logger.warning(k + " environment variable already set!")
        # Set env variable; values may be numbers, so stringify explicitly.
        value = v['values'] if isinstance(v, dict) else v
        env_vars.append(k + "=\"" + str(value) + "\"")
        auto_args.append("--" + k + "=${" + k + "}")

    # Create auto_args
    if 'auto_args' in os.environ:
        logger.warning("auto_args environment variable already set!")
    env_vars.append("")  # Add a new line between args and auto_args
    env_vars.append("auto_args=\"" + " ".join(auto_args) + "\"")

    # Handle times keyword
    prefix = ''
    suffix = ''
    if 'times' in script:
        prefix = "for run in {1.." + str(script['times']) + "}\ndo\n\n\n"
        suffix = "done"

    # Handle before_script / after_script
    before_script = config.get('before_script', '')
    after_script = config.get('after_script', '')

    # Join body
    body = before_script + "\n\n" + prefix + \
        "\n".join(env_vars) + "\n\n\n" + body + \
        "\n\n\n" + suffix + "\n\n" + after_script

    # Schedule job script
    if executer == 'bash':
        config['jobid'] = job.run(body, _cmd='bash', name_addition=name_addition)
    else:
        config['jobid'] = job.run(body, name_addition=name_addition)
params_to_vary = { 'seed': range(3, 9), 'lr': [0.5, 1.0], 'optimizer': ['sgd', 'adam'], 'use_num_hidden': [1, 2, 3, 4, 10], 'hidden_size': [256], 'dset': ['mnist', 'cifar10'], 'freeze': ['progress_first', 'progress_last'] 'save_reduce': [False], 'shuffle_labels': [False], 'first_layer_lr_mult': [1] } ''' # run s = Slurm("vision_standard", {"partition": partition, "time": "4-0"}) ks = sorted(params_to_vary.keys()) vals = [params_to_vary[k] for k in ks] param_combinations = list(itertools.product(*vals)) # list of tuples print(param_combinations) # for param_delete in params_to_delete: # param_combinations.remove(param_delete) # iterate for i in range(len(param_combinations)): param_str = 'module load python; python3 ../vision_fit/fit.py ' for j, key in enumerate(ks): param_str += key + ' ' + str(param_combinations[i][j]) + ' ' s.run(param_str)
import itertools
from slurmpy import Slurm
import pmlb as dsets

partition = 'high'

# sweep different ways to initialize weights
from dset_names import dset_names

dset_nums = range(0, 94)  # len 94

# class_weights = [2, 5, 10, 100]
class_weights = [2]

# run
s = Slurm("pmlb", {"partition": partition, "time": "1-0", "mem": "MaxMemPerNode"})

# One job per (class weight, dataset) pair, class weight varying slowest.
fit_cmd = 'module load python; python3 /accounts/projects/vision/chandan/class-weight-uncertainty/experiments/sweep_pmlb/fit.py '
for class_weight, i in itertools.product(class_weights, dset_nums):
    param_str = f'{fit_cmd}dset_name {dset_names[i]} class_weight {class_weight}'
    s.run(param_str)
cmd += [out_dir, str(cdn["iterations"]), "--benchmark", "--delay", str(cdn["delay"])] else: out_dir = op.join(lustre, 'results', out_dir + cdn_ident) cmd += [out_dir, str(cdn["iterations"]), "--benchmark"] if cdn["filesystem"] != "mem": work_dir = op.join(filesystems[cdn["filesystem"]], 'work', work_dir + cdn_ident) cmd += ["--cli", "--work_dir", work_dir] s = Slurm("incrementation", slurm_conf) if cdn["framework"] == "spark": cmd = " ".join(cmd) cmd = "\"{}\"".format(cmd) print("Submitting command: ", cmd) s.run("bash " + spark_template, cmd_kwargs={"spscript": cmd, "parallelism": slurm_conf["cpus-per-task"]}, _cmd=sys.argv[2]) else: with open(legends[cdn["dataset"]]) as legend: images = legend.read().split() num_images = len(images) pn_images = num_images/num_nodes
'num_iters': [40], 'seed': range(0, 1), 'dset': ['mnist', 'cifar10'], # mnist, cifar10 'num_layers': [4], # add in 2, 7 'batch_size': [100], # 10, 100, 1000 'shuffle_labels': [False], # loop 'hidden_size': [128], # 128, 512 'freeze': ['False'], 'first_layer_lr_mult': [1], 'save_all_freq': [20], 'save_reduce': [False], 'saves_per_iter': [2], } ''' # run s = Slurm("proto", {"partition": partition, "time": "3-0"}) ks = sorted(params_to_vary.keys()) vals = [params_to_vary[k] for k in ks] param_combinations = list(itertools.product(*vals)) # list of tuples print(param_combinations) # for param_delete in params_to_delete: # param_combinations.remove(param_delete) # iterate for i in range(len(param_combinations)): param_str = 'module load python; module load pytorch; python ../vision_fit/fit.py ' for j, key in enumerate(ks): param_str += key + ' ' + str(param_combinations[i][j]) + ' ' s.run(param_str)
from slurmpy import Slurm

# paths and constant values
nproc_old = 441  # number of processors used in bin files
# the mesh files
old_mesh_dir = "/scratch/05880/tg851791/asdf_sync/model_generating/tao_1d_ref_knl/DATABASES_MPI"
# the model files
old_model_dir = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/model/s362ani_good_min_tao_smooth"
model_tags = "vpv,vph,vsv,vsh,eta,qmu,rho"  # values to generate
# output directory
output_file = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/ppm/notopo/3d"
# region as lon1/lat1/lon2/lat2/dep1/dep2 (eg: if lon1=30, lon2=20, get points like 30, 29, ...)
region = "74/0/175/62/0/800"
npts = "249/405/321"  # number of points, including the edge points.
# use 18*18 cores, can be set any way you like. (two directions, divide subregions)
nproc = "18/18"

# check that ../../specfem_gll.jl/src/program/get_ppm_model.jl is actually
# the path of get_ppm_model.jl
julia_call = " ".join([
    "ibrun julia ../../specfem_gll.jl/src/program/get_ppm_model.jl",
    "--nproc_old", str(nproc_old),
    "--old_mesh_dir", old_mesh_dir,
    "--old_model_dir", old_model_dir,
    "--model_tags", model_tags,
    "--output_file", output_file,
    "--region", region,
    "--npts", npts,
    "--nproc", nproc,
])
# Bracket the run with `date` so the log shows start and end times.
command = "date;" + julia_call + ";" + "date;"

# run 2h18min for my region, 60d*60d, 336*336NEX 21*21 proc. It's safe to set a longer time.
s = Slurm(
    "ppm",
    dict(partition="normal", nodes=5, ntasks=324, time="04:00:00",
         account="TG-EAR130011"),
)
s.run(command)
with open(args.config) as f: config = json.load(f) check_config(config, args) samples = read_samplesheet(args.sample, args.project) if args.mkref: jobids = [] for sample in samples: fastq = samples[sample] run_bwa = align_and_convert(config, fastq, args, sample) bwa = Slurm("bwaAln-{}".format(sample), { "account": config["slurm"]["account"], "partition": "node", "time": config["slurm"]["time"] }, log_dir="{}/logs".format(args.out), scripts_dir="{}/scripts".format(args.out)) jobids.append(bwa.run(run_bwa)) wcxmkref = Slurm("wcxmkref", { "account": config["slurm"]["account"], "partition": "node", "time": config["slurm"]["time"] }, log_dir="{}/logs".format(args.out), scripts_dir="{}/scripts".format(args.out)) wcxmkref.run(mkref(config, args), depends_on=jobids) elif args.mkmodel:
import itertools
from slurmpy import Slurm

# Grid of settings; one job is submitted per element of the cross product.
params_to_vary = {
    'seed': range(60),
    'hidden1': [1, 2, 3, 5, 10, 30, 50, 100],
    'init': ['default', 'data-driven'],
}

# run
s = Slurm("small_nn_run", {"partition": "low"})
ks = sorted(params_to_vary.keys())
param_combinations = list(
    itertools.product(*(params_to_vary[k] for k in ks)))  # list of tuples

# iterate
for combo in param_combinations:
    param_str = 'module load python; module load pytorch; python3 ../fit.py '
    param_str += ''.join(
        key + ' ' + str(value) + ' ' for key, value in zip(ks, combo))
    s.run(param_str)
def main():
    """Command-line entry point: start worker jobs, then run the Spark driver.

    Reads a batch-script template path and a JSON parameter file from the
    command line, fills in default log/lock paths under ``conf["logdir"]``,
    launches ``conf["num_nodes"]`` worker scripts through ``Slurm.run``
    (``sbatch``, or ``bash`` in threads when ``--no_submit`` is given), waits
    for the master log file to appear, reads the master URL from its first
    line and finally starts ``spark-submit``.
    """
    parser = argparse.ArgumentParser(description='Pilot-Agent scheduling for SLURM')
    parser.add_argument('template', type=str, help="SLURM batch script template")
    parser.add_argument('params', type=argparse.FileType('r'), help="SLURM batch script params (JSON)")
    parser.add_argument('-y', '--yarn', action='store_true', help="Yarn scheduler will be used")
    parser.add_argument('-D', '--no_submit', action='store_true', help="Create but do not submit sbatch scripts" )
    args = parser.parse_args()
    conf = None
    with args.params as f:
        conf = json.load(f)
    # With --yarn the Hadoop "slaves" file is truncated to empty.
    # NOTE(review): the first branch tests for 'COMPUTE' in the environment
    # but reads 'HADOOP_HOME' -- confirm this is the intended variable.
    if args.yarn and 'COMPUTE' in os.environ:
        open(op.join(os.environ['HADOOP_HOME'], 'etc/hadoop/slaves'), 'w').close()
    elif args.yarn:
        open(op.join(conf["COMPUTE"]["HADOOP_HOME"], 'etc/hadoop/slaves'), 'w').close()
    submit_func = "bash" if args.no_submit else "sbatch"
    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    program_start = datetime.now().strftime("%Y-%m-%d")
    # Run identifier: SHA-1 of the template path string plus a random suffix.
    rand_hash = '{0}-{1}'.format(hashlib.sha1(args.template.encode("utf-8")).hexdigest(), hashlib.md5(os.urandom(16)).hexdigest())
    # When submitting, the literal placeholder is left in the file name.
    job_id = rand_hash if args.no_submit else '${SLURM_JOB_ID}'
    if not "COMPUTE" in conf:
        conf["COMPUTE"] = {}
    if not "mstr_bench" in conf["COMPUTE"]:
        conf["COMPUTE"]["mstr_bench"] = op.join(conf["logdir"], "master-{0}-benchmarks.{1}.out".format(program_start, job_id))
    if not "mstr_log" in conf["COMPUTE"]:
        conf["COMPUTE"]["mstr_log"] = op.join(conf["logdir"], "master-{0}-{1}.out".format(program_start, rand_hash))
    conf["COMPUTE"]["mstr_lock"] = op.join(conf["logdir"], "master-{0}-{1}.lock".format(program_start, rand_hash))
    conf["COMPUTE"]["logdir"] = conf["logdir"]
    # if you want to run one master and worker locally, might as well submit to local
    rm_nnodes = 1 if args.no_submit else 0
    for i in range(conf["num_nodes"] - rm_nnodes):
        # SLURM batch submit workers
        if args.no_submit:
            # Run each worker script in a daemon thread so this loop does not
            # wait for it.
            thread = threading.Thread(target=s.run, kwargs=dict(command=args.template, cmd_kwargs=conf["COMPUTE"], _cmd=submit_func))
            thread.daemon = True
            thread.start()
        else:
            s.run(args.template, name_addition=rand_hash, cmd_kwargs=conf["COMPUTE"], _cmd=submit_func)
    # Poll every five seconds until the master log file exists.
    while conf["num_nodes"] - rm_nnodes > 0 and not op.isfile(conf["COMPUTE"]["mstr_log"]):
        time.sleep(5)
    if conf["num_nodes"] - rm_nnodes > 0:
        master_url = ""
        # The first line of the master log is used as the Spark master URL.
        with open(conf["COMPUTE"]["mstr_log"], 'r') as f:
            master_url = f.readline().strip('\n')
    program = None
    driver_out = op.join(conf["logdir"], "driver-{0}-{1}.out".format(program_start, rand_hash))
    if not args.no_submit:
        # Drive the process started from conf["DRIVER"]["slurm_alloc"] by
        # writing shell commands to its stdin; its output goes to driver_out.
        fw = open(driver_out, "wb")
        fr = open(driver_out, "r")
        p = Popen(conf["DRIVER"]["slurm_alloc"], stdin = PIPE, stdout = fw, stderr = fw, bufsize = 1)
        for module in conf["DRIVER"]["modules"]:
            p.stdin.write("module load {}\n".format(module).encode('utf-8'))
        p.stdin.write("echo start $(date +%s.%N)\n".encode('utf-8'))
        program = ("spark-submit --master {0} --executor-cores=${{SLURM_CPUS_PER_TASK}} "
                   "--executor-memory=${{SLURM_MEM_PER_NODE}}M --driver-memory=${{SLURM_MEM_PER_NODE}}M {1}\n") \
            .format(master_url, conf["DRIVER"]["program"])
        p.stdin.write(program.encode('utf-8'))
        # NOTE(review): `out` is never used afterwards, and stdin is neither
        # flushed/closed nor is the process waited on before the files are
        # closed -- confirm the commands are actually delivered.
        out = fr.read()
        p.stdin.write("echo end $(date +%s.%N)\n".encode('utf-8'))
        p.stdin.write("echo 'SUCCEEDED' >> {}".format(conf["COMPUTE"]["mstr_log"]).encode('utf-8'))
        fw.close()
        fr.close()
    elif conf["num_nodes"] == 1:
        # Single node without submission: run Spark in local mode.
        program = ("spark-submit --master local[*] {}\n").format(conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout = PIPE, stderr = PIPE)
        # Note: the first element returned by communicate() is the captured
        # stdout, despite the variable name.
        stdin, stderr = p.communicate()
        print(stdin, stderr)
    else:
        # Several nodes without submission: connect to the master started above.
        program = ("spark-submit --master {0} {1}\n").format(master_url, conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout = PIPE, stderr = PIPE)
        stdin, stderr = p.communicate()
        print(stdin, stderr)
cmd += [ out_dir, str(cdn["iterations"]), "--benchmark", "--delay", str(cdn["delay"]) ] else: out_dir = op.join(lustre, 'results', out_dir + cdn_ident) cmd += [out_dir, str(cdn["iterations"]), "--benchmark"] if cdn["filesystem"] != "mem": work_dir = op.join(filesystems[cdn["filesystem"]], 'work', work_dir + cdn_ident) cmd += ["--cli", "--work_dir", work_dir] s = Slurm("incrementation", slurm_conf) if cdn["framework"] == "spark": cmd = " ".join(cmd) cmd = "\"{}\"".format(cmd) print("Submitting command: ", cmd) s.run("bash " + spark_template, cmd_kwargs={"spscript": cmd}, _cmd=sys.argv[2]) else: with open(legends[cdn["dataset"]]) as legend: images = legend.read().split() num_images = len(images) pn_images = num_images / num_nodes
# sweep these 'num_layers': [1, 2], # 1, 2, 3 'N': [200], 'd': [2], #, 8, 50, 128, 190, 200, 210, 400], 'hidden_size': [64, 128], # 12, 64 'seed': range(0, 30), # for understanding correlated vars, need this ~1000 'opt': ['adam'], 'lr': [5e-3], 'num_iters': [int(5e5)], 'use_bias': [False], 'eps': [0.1], } # run s = Slurm("interactions", {"partition": partition, "time": "1-0"}) ks = sorted(params_to_vary.keys()) vals = [params_to_vary[k] for k in ks] param_combinations = list(itertools.product(*vals)) # list of tuples print(param_combinations) # for param_delete in params_to_delete: # param_combinations.remove(param_delete) # iterate for i in range(len(param_combinations)): param_str = 'module load python; module load pytorch; python3 ../poly_fit/fit.py ' for j, key in enumerate(ks): param_str += key + ' ' + str(param_combinations[i][j]) + ' ' s.run(param_str)
import itertools from slurmpy import Slurm partition = 'gpu' # run (change bottom line for max_corrs or margins!) s = Slurm("cnn_feats", { "partition": partition, "time": "2-0", "gres": "gpu:1" }) # models = ['vgg16', 'vgg19', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'densenet121', 'densenet201']# ['vgg11', 'vgg13', 'resnet18', 'densenet169'] # ['alexnet', 'vgg13'] #, # just the basics models = [ 'alexnet', 'vgg11', 'vgg13', 'resnet18', 'resnet34', 'densenet121', 'densenet169' ] # all nets # ['alexnet', 'vgg11', 'vgg13', 'vgg16', 'vgg19', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'densenet121', 'densenet169', 'densenet201'] # 'alexnet', # 'vgg11', 'vgg13', 'vgg16', 'vgg19', # 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', # 'densenet169', 'densenet201', # densenet121 # 'inception_v3'] # iterate for i, model in enumerate(models): # param_str = 'module load python; python3 ../vision_analyze/max_corr_cnns.py ' # param_str = 'module load python; python3 ../vision_analyze/cnns/save_imagenet_preds.py '
def wrapper(*args, **kwargs):
    """Run the wrapped command directly, or hand it to SLURM in sbatch mode.

    Without the grandparent ``sbatch`` option the wrapped ``func`` is invoked
    as usual. Otherwise a job script is built (with a burst-buffer stage-in
    header when ``bb`` is set) and passed to slurmpy: submitted with
    ``sbatch`` when ``run`` is set, or merely printed via ``cat`` otherwise.

    Returns:
        The result of ``func`` outside sbatch mode; in sbatch mode
        ``ReturnCodes.SUBMITTED`` when ``run`` is set, else
        ``ReturnCodes.SUCCESS``.

    Raises:
        EmmetCliError: If the parent command has no ``directory`` option value.
    """
    ctx = click.get_current_context()
    ctx.grand_parent = ctx.parent.parent
    top_params = ctx.grand_parent.params
    if not top_params["sbatch"]:
        return ctx.invoke(func, *args, **kwargs)

    run = top_params["run"]
    if run:
        click.secho("SBATCH MODE! Submitting to SLURM queue.", fg="green")

    directory = ctx.parent.params.get("directory")
    if not directory:
        raise EmmetCliError(
            f"{ctx.parent.command_path} needs --directory option!")

    # Scripts and logs are kept in a hidden tracking directory.
    track_dir = os.path.join(directory, ".emmet")
    if run and not os.path.exists(track_dir):
        os.mkdir(track_dir)
        logger.debug(f"{track_dir} created")

    bb = top_params["bb"]
    yes = top_params["yes"]
    if bb:
        if not yes:
            click.confirm("Did you run `module unload esslurm`?", abort=True)
        subdir = directory.rsplit(os.sep, 1)[1]
        header_lines = [
            "#DW jobdw capacity=10TB access_mode=striped type=scratch",
            f"#DW stage_in source={directory} "
            f"destination=$DW_JOB_STRIPED/{subdir} type=directory",
            "srun hostname",
            "",
        ]
        command = "\n".join(header_lines)
        slurm_kwargs = dict(qos="premium", nodes=1, constraint="haswell",
                            time="48:00:00")
        slurm_kwargs = {
            "qos": slurm_kwargs["qos"],
            "nodes": slurm_kwargs["nodes"],
            "tasks-per-node": 1,
            "constraint": slurm_kwargs["constraint"],
            "time": slurm_kwargs["time"],
        }
    else:
        if not yes:
            click.confirm("Did you run `module load esslurm`?", abort=True)
        command = ""
        slurm_kwargs = dict(qos="xfer", time="48:00:00", licenses="SCRATCH",
                            mem="30GB")

    job = Slurm(
        ctx.command_path.replace(" ", "-"),
        slurm_kwargs=slurm_kwargs,
        date_in_name=False,
        scripts_dir=track_dir,
        log_dir=track_dir,
        bash_strict=False,
    )
    command = command + reconstruct_command(sbatch=True)

    # slurmpy reports on stderr; capture it so it can be logged instead.
    submit_cmd = "sbatch" if run else "cat"
    captured = io.StringIO()
    with contextlib.redirect_stderr(captured):
        job.run(command, _cmd=submit_cmd, tries=1)  # 6 days
    ret = captured.getvalue()[2:-1]
    logger.info("\n" + ret.encode("utf-8").decode("unicode_escape"))

    # TODO add jobid to SUBMITTED.value
    if run:
        return ReturnCodes.SUBMITTED
    return ReturnCodes.SUCCESS
def cluster(tool, invocation, clowdrloc, dataloc, cluster, **kwargs):
    """cluster
    Launches a pipeline on a cluster scheduler through the Clowdr wrappers.
    One scheduler job running ``clowdr run <task> -c <taskdir> --local`` is
    submitted per task.

    Parameters
    ----------
    tool : str
        Path to a boutiques descriptor for the tool to be run
    invocation : str
        Path to a boutiques invocation for the tool and parameters to be run
    clowdrloc : str
        Path for storing Clowdr intermediate files and outputs
    dataloc : str
        Path for accessing input data. If local, provide the hostname and
        optionally a path. If on S3, provide an S3 path.
    cluster : str
        Scheduler on the cluster being used. Currently, the only supported
        mode is slurm. (The value is not inspected by this function.)
    **kwargs : dict
        Arbitrary keyword arguments. Arguments read here:
            - jobname : str
                Base-name for the jobs as they will appear in the scheduler
                (defaults to "clowdrtask")
            - slurm_args : str
                Comma-separated ``key:value`` scheduler options; everything
                after the first colon is the value
            - simg : str
                Path to a container image; made absolute before use
            - workdir : str
                Passed to ``clowdr run`` as ``-w``
            - volumes : list
                Each entry is passed to ``clowdr run`` as ``-v``
            - verbose : bool
                Toggle verbose output printing
            - dev : bool
                Toggle dev mode (only runs first execution in the specified
                set)
        All keyword arguments are also forwarded to
        "controller.metadata.consolidateTask" and "utils.getContainer".

    Returns
    -------
    str
        The directory holding the task definitions, from which the jobs
        were submitted
    """
    # TODO: scrub inputs
    tool = utils.truepath(tool)
    if kwargs.get("simg"):
        kwargs["simg"] = utils.truepath(kwargs["simg"])

    from slurmpy import Slurm

    verbose = kwargs.get("verbose")
    if verbose:
        print("Consolidating metadata...")
    [tasks, invocs] = metadata.consolidateTask(tool, invocation, clowdrloc,
                                               dataloc, **kwargs)
    if kwargs.get("dev"):
        tasks = [tasks[0]]  # Just launch the first task in dev

    taskdir = op.dirname(utils.truepath(tasks[0]))
    os.makedirs(taskdir, exist_ok=True)
    # Jobs are submitted from inside the task directory.
    os.chdir(taskdir)

    with open(tool) as fhandle:
        container = json.load(fhandle).get("container-image")
    if container:
        if verbose:
            print("Getting container...")
        outp = utils.getContainer(taskdir, container, **kwargs)
        if verbose:
            print(outp)

    jobname = kwargs.get("jobname") or "clowdrtask"

    slurm_args = {}
    if kwargs.get("slurm_args"):
        for opt in kwargs["slurm_args"].split(","):
            # Split on the first colon only: values such as "01:00:00"
            # contain colons themselves.
            key, _, value = opt.partition(":")
            slurm_args[key] = value
    job = Slurm(jobname, slurm_args)

    script = "clowdr run {} -c {} --local"
    if kwargs.get("workdir"):
        script += " -w {}".format(kwargs["workdir"])
    if kwargs.get("volumes"):
        script += " ".join(
            [" -v {}".format(vol) for vol in kwargs.get("volumes")])

    for task in tasks:
        job.run(script.format(task, taskdir))

    if verbose:
        print(taskdir)
    return taskdir
import itertools
from slurmpy import Slurm

partition = 'gpu_yugroup'

# sweep lambda_reg
params_to_vary = {
    '--reg1': [0, 1e-1, 5e-1, 1e0, 1e1, 1e2, 1e3],
    '--reg2': [0],
}

# run
s = Slurm("decode", {"partition": partition, "time": "3-0", "gres": "gpu:1"})
ks = sorted(params_to_vary.keys())
param_combinations = list(
    itertools.product(*(params_to_vary[k] for k in ks)))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# iterate: one job per combination, flags appended as "<key> <value> "
for combo in param_combinations:
    param_str = 'module load python; module load pytorch; python ../train.py '
    param_str += ''.join(
        key + ' ' + str(value) + ' ' for key, value in zip(ks, combo))
    s.run(param_str)
import itertools
from slurmpy import Slurm

partition = 'high'

# sweep small dsets
params_to_vary = {
    'alpha': [1, 10],  # [0.001, 0.05, 1, 10],
    'num_bases': [25, 100, 400],
    'class_num': [None],  # [0, 1]
    'batch_size': [100],
}

# run
s = Slurm("sparse_coding", {"partition": partition, "time": "4-0"})
ks = sorted(params_to_vary.keys())
param_combinations = list(
    itertools.product(*(params_to_vary[k] for k in ks)))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# iterate: one job per combination, arguments appended as "<key> <value> "
for combo in param_combinations:
    param_str = 'module load python; python3 run_sparse.py '
    param_str += ''.join(
        key + ' ' + str(value) + ' ' for key, value in zip(ks, combo))
    s.run(param_str)