Ejemplo n.º 1
0
def submit_job(thecommand):
    """Run ``thecommand`` through a ``Slurm`` object named "sync".

    The resource request (nodes, tasks, partition, wall time) is fixed
    inside this function.
    """
    resources = dict(
        nodes=92,
        ntasks=4416,
        partition='skx-normal',
        time="06:00:00",
    )
    job = Slurm("sync", resources)
    job.run(thecommand)
Ejemplo n.º 2
0
def submit_sbatch(template, conf):
    """Submit ``template`` via sbatch, wait for the job to leave the queue,
    then record its status.

    Args:
        template: command/template handed to ``Slurm.run``.
        conf: configuration mapping. Keys read here: ``name``,
            ``SLURM_CONF_GLOBAL``, ``DRIVER``, ``COMPUTE`` and, optionally,
            ``benchmark``.

    The function polls ``squeue -j <job id>`` every five minutes until the
    job id no longer appears, then looks under ``logs/`` for a file whose
    name contains ``<job id>.err`` and passes it to ``job_status``.
    Nothing is returned; when ``conf`` has a ``benchmark`` entry the result
    is written through the ``write_bench_*`` helpers.
    """
    logging.warning(hist_fn)

    logging.info('Starting batch submission')

    if "benchmark" in conf:
        write_bench_start(conf["benchmark"])

    submit_func = "sbatch"
    rand_hash = ""  # gen_hash(template)
    # Literal placeholder: the real id is only known once the job runs.
    job_id = '${SLURM_JOB_ID}'
    configure(conf, job_id, rand_hash)
    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    conf["DRIVER"]["mstr_bench"] = conf["COMPUTE"]["mstr_bench"]
    logging.info('Command to be executed: %s', conf["DRIVER"]["program"])
    job_id = str(s.run(template, cmd_kwargs=conf["DRIVER"], _cmd=submit_func))

    logging.info('Batch job ID: %s', job_id)
    time.sleep(5)

    while True:
        p = Popen(["squeue", "-j", job_id], stdout=PIPE, stderr=PIPE)
        out, _ = p.communicate()
        out = str(out, 'utf-8')

        logging.debug("Squeue output: %s", out)

        # Drop the header row and blank lines; the first column is the job
        # id. (The previous filter compared a list against '' and therefore
        # never filtered anything.)
        queue = [
            line.split()[0] for line in out.splitlines()[1:] if line.strip()
        ]

        if job_id not in queue:
            break

        logging.info('Job still running, sleeping for 5 mins')
        time.sleep(5 * 60)

    logging.info('Batch Job terminated')
    result = 'UNKNOWN'
    err_suffix = '{}.err'.format(job_id)
    logfile = [
        op.join(d, f) for d, _, lf in os.walk(op.abspath('logs')) for f in lf
        if err_suffix in f
    ]

    if logfile:
        logging.info('Driver logfile: %s', logfile[0])
        result = job_status(logfile[0])
    else:
        logging.warning('No logfile generated.')

    if "benchmark" in conf:
        write_bench_end(conf["benchmark"])
        write_bench_result(conf["benchmark"], result)
def submit_job(thecommand):
    """Run ``thecommand`` through a ``Slurm`` object named "process_sync".

    Node and task counts come from the module-level ``N_node`` and
    ``N_cores``; the remaining settings are fixed here.
    """
    resources = {
        "nodes": N_node,
        "ntasks": N_cores,
        "time": "04:00:00",
        "cpus-per-task": 1,
        "mem-per-cpu": "2G",
    }
    job = Slurm("process_sync", resources)
    job.run(thecommand)
Ejemplo n.º 4
0
def submit_job(thecommand):
    """Run ``thecommand`` through a ``Slurm`` object named "sync".

    All scheduler settings, including the account, are fixed here.
    """
    resources = dict(
        nodes=28,
        ntasks=1344,
        partition='skx-normal',
        time="06:00:00",
        account="TG-EAR130011",
    )
    job = Slurm("sync", resources)
    job.run(thecommand)
Ejemplo n.º 5
0
def submit_job(thecommand):
    """Run ``thecommand`` through a ``Slurm`` object named "sync".

    Every scheduler setting is read from a module-level name
    (``N_node``, ``ntasks``, ``partition``, ``time``, ``account``).
    """
    resources = {
        "nodes": N_node,
        "ntasks": ntasks,
        "partition": partition,
        "time": time,
        "account": account,
    }
    job = Slurm("sync", resources)
    job.run(thecommand)
Ejemplo n.º 6
0
def submit_job(thecommand):
    """Run ``thecommand`` through a ``Slurm`` object named "sync".

    All scheduler settings, including the account, are fixed here.
    """
    resources = dict(
        nodes=81,
        ntasks=3888,
        partition='skx-normal',
        time="20:00:00",
        account="TG-EAR140030",
    )
    job = Slurm("sync", resources)
    job.run(thecommand)
Ejemplo n.º 7
0
def run_parallel(ks, param_combinations, partition='low'):
    '''Hand one command per parameter combination to slurmpy.

    ks lists the option names; each entry of param_combinations holds the
    matching values by position. Every command is printed and then passed
    to Slurm.run (requires the slurmpy package).
    '''
    from slurmpy import Slurm
    scheduler = Slurm("fit_mog", {"partition": partition, "time": "4-0"})

    for idx, combo in enumerate(param_combinations):
        command = f'module load python; python3 {opj(DIR_FILE, "sim_gaussian_mixture.py")} '
        for pos, key in enumerate(ks):
            command += '--' + key + ' ' + str(combo[pos]) + ' '
        print(f'scheduled: {command}\n\t({idx}/{len(param_combinations)})')
        scheduler.run(command)
Ejemplo n.º 8
0
    def create_job(self,
                   name: str,
                   afterok: list = None,
                   afternotok: list = None) -> Slurm:
        """Build a ``Slurm`` job object, optionally with job dependencies.

        ``afterok`` / ``afternotok`` are iterables of job ids. When given,
        they are rendered as ``afterok:<id>:<id>`` / ``afternotok:<id>:<id>``
        clauses, joined with commas and stored under the ``dependency`` key
        of ``self.slurm_settings``. When neither is given, that key is
        removed from ``self.slurm_settings``.
        """
        LOG.info("Create a slurm job with name %s", name)

        clauses = []
        for kind, job_ids in (("afterok", afterok), ("afternotok", afternotok)):
            if not job_ids:
                continue
            joined_ids = ":".join(str(job_id) for job_id in job_ids)
            clauses.append("{}:{}".format(kind, joined_ids))

        if clauses:
            self.slurm_settings["dependency"] = ",".join(clauses)
        else:
            self.slurm_settings.pop("dependency", None)

        return Slurm(
            name,
            self.slurm_settings,
            log_dir=str(self.log_dir),
            scripts_dir=str(self.scripts_dir),
        )
Ejemplo n.º 9
0
def my_Slurm(*args, cfg_update=None, **kwargs):
    """Shortcut to slurmpy's class; keep certain default kwargs
    and only update some with kwarg `cfg_update`
    see https://github.com/brentp/slurmpy

    Args:
        *args: positional arguments forwarded to ``Slurm``.
        cfg_update: optional mapping whose entries override those of
            ``cluster.slurm_cfg``; ``None`` (the default) means no overrides.
        **kwargs: extra keyword arguments forwarded to ``Slurm``.

    Returns:
        The ``Slurm`` instance, created with the module-level ``log_dir``
        and ``slurm_scripts_dir``.
    """
    # Use None rather than a shared dict() default so no state can leak
    # between calls.
    overrides = {} if cfg_update is None else cfg_update
    return Slurm(*args,
                 slurm_kwargs=dict(cluster.slurm_cfg, **overrides),
                 log_dir=log_dir,
                 scripts_dir=slurm_scripts_dir,
                 **kwargs)
Ejemplo n.º 10
0
 def create_job(self, name: str) -> Slurm:
     """Build a ``Slurm`` job object from this instance's account and time.

     Scripts and logs are directed to ``self.scripts_dir`` and
     ``self.log_dir`` (both converted to ``str``).
     """
     LOG.info("Create a slurm job with name %s", name)
     settings = {"account": self.account, "time": self.time}
     return Slurm(
         name,
         settings,
         scripts_dir=str(self.scripts_dir),
         log_dir=str(self.log_dir),
     )
Ejemplo n.º 11
0
def submit_locally(template, conf):
    """Start workers with ``bash`` and run the driver through spark-submit.

    ``configure`` is called with an empty job id, ``start_workers`` supplies
    the master URL, and the driver's captured stdout/stderr are printed.
    """
    submit_func = "bash"
    rand_hash = gen_hash(template)

    configure(conf, "", rand_hash)
    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])
    master_url = start_workers(s, conf["num_nodes"], conf["COMPUTE"], template,
                               rand_hash, submit_func)

    program = ["spark-submit", "--master", master_url]
    if "jars" in conf["DRIVER"]:
        program += ["--jars", conf["DRIVER"]["jars"]]
    program.append(conf["DRIVER"]["program"])

    driver = Popen(program, stdout=PIPE, stderr=PIPE)
    stdout, stderr = driver.communicate()
    print(stdout, stderr)
Ejemplo n.º 12
0
from slurmpy import Slurm
import argparse

partition = 'high'

s = Slurm("compare_models", {"partition": partition})

# All command-line switches are plain boolean flags.
parser = argparse.ArgumentParser()
for switch in ('--val_only', '--test_only', '--ignore_cache', '--low_data'):
    parser.add_argument(switch, action='store_true')
args = parser.parse_args()

models = [
    'random_forest',
    'gradient_boosting',
    'skope_rules',
    'rulefit',
    'fplasso',
    'fpskope',
    'grl',
    'oner',
    'brs',
]

# Forward the cache/data switches to every job command.
extra_args = ''.join(
    text for enabled, text in (
        (args.ignore_cache, ' --ignore_cache'),
        (args.low_data, ' --low_data'),
    ) if enabled)

if not args.test_only:
    for model in models:
        command = f'python experiments/compare_models.py --model {model} --cv'
        s.run(command + extra_args)
Ejemplo n.º 13
0
import itertools
from slurmpy import Slurm
import numpy as np

partition = 'high'
kernel_version = True

params_to_vary = {'run': list(range(100))}

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("fmri", {"partition": partition, "time": "2-0"})
ks = sorted(params_to_vary)
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(len(param_combinations))
ks = np.array(ks)

# One job per combination; the script depends on kernel_version.
for combo in param_combinations:
    script = '../fmri/run_kernel.py ' if kernel_version else '../fmri/run.py '
    param_str = 'module load python; python3 ' + script
    for j, key in enumerate(ks):
        param_str += key + ' ' + str(combo[j]) + ' '
    print(param_str)
    s.run(param_str)
Ejemplo n.º 14
0
from slurmpy import Slurm

# * background+tao/crust1.0 -> background+tao/crust1.0 -> background+tao/crust1.0+min/crust2.0
print("start job 4")

# Source (old) mesh/model and target (new) mesh/model for the interpolation.
nproc_old = 336
old_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/tao_h_files/DATABASES_MPI"
old_model_dir = "/scratch/05880/tg851791/binfile/perturbation/tao_perturbation_smooth"
nproc_new = 441
new_mesh_dir = "/work/05880/tg851791/stampede2/specfem/20190115/simulation_taoreg/DATABASES_MPI"
new_model_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin-pert"
model_tags = "vph,vpv,vsh,vsv,eta,qmu,rho"
output_dir = "/scratch/05880/tg851791/binfile/interp/s362ani_addmin_addtao-pert"

# Assemble the command from its options, separated by single spaces.
command4 = " ".join([
    "ibrun julia src/program/xsem_interp_mesh2.jl",
    f"--nproc_old {nproc_old}",
    f"--old_mesh_dir {old_mesh_dir}",
    f"--old_model_dir {old_model_dir}",
    f"--nproc_new {nproc_new}",
    f"--new_mesh_dir {new_mesh_dir}",
    f"--new_model_dir {new_model_dir}",
    f"--model_tags {model_tags}",
    f"--output_dir {output_dir}",
])
s4 = Slurm(
    "bg+tao",
    dict(partition="skx-normal", nodes=10, ntasks=441, time="00:60:00"),
)
jobid_s4 = s4.run(command4)
Ejemplo n.º 15
0
def schedule(config: dict, name_addition: str = None):
    """
    Schedules a given configuration as a new job

    The job script is assembled from ``before_script``, an optional
    ``for run in {1..times}`` loop, the exported parameters, the script body
    and ``after_script``. The value returned by ``job.run`` is stored in
    ``config['jobid']``; host-specific parameters are merged into
    ``config['scheduler']['parameters']`` in place.

    Args:
        config (dict): job configuration
        name_addition (str, optional): Defaults to None. Addition to the job
            name; also exported as ``jobId`` (empty when None).

    Raises:
        RuntimeError: When requested scheduler is not available
    """
    scheduler_cfg = config['scheduler']
    script_cfg = config['script']

    executer = scheduler_cfg['type'].lower()
    if executer not in ('slurm', 'bash'):
        logger.error("Only SLURM or bash are supported at the moment!")
        raise RuntimeError("Unsupported Job Manager!")

    # If a host entry matches replace the found parameters
    if 'host' in scheduler_cfg:
        hostname = socket.getfqdn()
        logger.debug("Hostname: " + hostname)
        if hostname in scheduler_cfg['host']:
            logger.debug("Found host entry for this hostname")
            # A missing or empty base section is treated as {} so that
            # host overrides still apply.
            if not scheduler_cfg.get('parameters'):
                scheduler_cfg['parameters'] = {}
            scheduler_cfg['parameters'].update(
                scheduler_cfg['host'][hostname]['parameters'])

    # Create Slurm job script, allow empty parameters
    parameters = dict(scheduler_cfg.get('parameters') or {})

    # Optional settings with their defaults
    log_directory = script_cfg.get('log-directory')
    job_name = scheduler_cfg.get('job-name', "ace")

    job = Slurm(job_name, parameters, log_directory=log_directory)

    env_vars = []
    auto_args = []

    # Add env var with job id; name_addition defaults to None, which used to
    # raise TypeError on string concatenation.
    env_vars.append("jobId=" + (name_addition or ""))

    for k, v in script_cfg['parameters'].items():
        # Check if variable already set
        if k in os.environ:
            logger.warning(k + " environment variable already set!")

        # Set env variable. Dict-valued parameters are exported only;
        # scalar ones are also forwarded as --name=${name}. Formatting
        # instead of concatenation lets non-string values (e.g. numbers)
        # through.
        if isinstance(v, dict):
            env_vars.append('{}="{}"'.format(k, v['values']))
        else:
            env_vars.append('{}="{}"'.format(k, v))
            auto_args.append("--" + k + "=${" + k + "}")

    # Create auto_args
    if 'auto_args' in os.environ:
        logger.warning("auto_args environment variable already set!")
    env_vars.append("")  # Add a new line between args and auto_args
    env_vars.append("auto_args=\"" + " ".join(auto_args) + "\"")

    # Handle times keyword (str() so an integer count is accepted too)
    prefix = ''
    suffix = ''
    if 'times' in script_cfg:
        prefix = "for run in {1.." + str(script_cfg['times']) + "}\ndo\n\n\n"
        suffix = "done"

    before_script = config.get('before_script', '')
    after_script = config.get('after_script', '')

    # Join body
    body = before_script + "\n\n" + prefix + \
        "\n".join(env_vars) + "\n\n\n" + script_cfg['body'] + \
        "\n\n\n" + suffix + "\n\n" + after_script

    # Schedule job script; 'bash' runs the script directly instead of
    # using the default submit command.
    run_kwargs = {'name_addition': name_addition}
    if executer == 'bash':
        run_kwargs['_cmd'] = 'bash'
    config['jobid'] = job.run(body, **run_kwargs)
Ejemplo n.º 16
0
params_to_vary = {
    'seed': range(3, 9),
    'lr': [0.5, 1.0],
    'optimizer': ['sgd', 'adam'],
    'use_num_hidden': [1, 2, 3, 4, 10],
    'hidden_size': [256],
    'dset': ['mnist', 'cifar10'], 
    'freeze': ['progress_first', 'progress_last']
    'save_reduce': [False],
    'shuffle_labels': [False],
    'first_layer_lr_mult': [1]

}
'''

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("vision_standard", {"partition": partition, "time": "4-0"})
ks = sorted(params_to_vary.keys())
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    param_str = 'module load python; python3 ../vision_fit/fit.py '
    param_str += ''.join(
        key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks))
    s.run(param_str)
import itertools
from slurmpy import Slurm
import pmlb as dsets

partition = 'high'

# sweep different ways to initialize weights
from dset_names import dset_names
dset_nums = range(0, 94)  # len 94
# class_weights = [2, 5, 10, 100]
class_weights = [2]

s = Slurm("pmlb", {
    "partition": partition,
    "time": "1-0",
    "mem": "MaxMemPerNode",
})

# One job per (class weight, dataset index) pair; class weight is the
# outer loop.
for class_weight, i in itertools.product(class_weights, dset_nums):
    param_str = ''.join([
        'module load python; python3 /accounts/projects/vision/chandan/class-weight-uncertainty/experiments/sweep_pmlb/fit.py ',
        'dset_name ' + str(dset_names[i]) + ' ',
        'class_weight ' + str(class_weight),
    ])
    s.run(param_str)
Ejemplo n.º 18
0
        cmd += [out_dir, str(cdn["iterations"]), "--benchmark",
                "--delay", str(cdn["delay"])]
    else:
        out_dir = op.join(lustre, 'results', out_dir + cdn_ident)
        cmd += [out_dir, str(cdn["iterations"]), "--benchmark"]


    if cdn["filesystem"] != "mem":
        work_dir = op.join(filesystems[cdn["filesystem"]],
                           'work',
                           work_dir + cdn_ident)

        cmd += ["--cli", "--work_dir", work_dir]


    s = Slurm("incrementation", slurm_conf)

    if cdn["framework"] == "spark":
        cmd = " ".join(cmd)
        cmd = "\"{}\"".format(cmd)

        print("Submitting command: ", cmd)

        s.run("bash " + spark_template, cmd_kwargs={"spscript": cmd,
                                                    "parallelism": slurm_conf["cpus-per-task"]},
              _cmd=sys.argv[2])
    else:
        with open(legends[cdn["dataset"]]) as legend:
            images = legend.read().split()
            num_images = len(images)
            pn_images = num_images/num_nodes
Ejemplo n.º 19
0
    'num_iters': [40],    
    'seed': range(0, 1),
    'dset': ['mnist', 'cifar10'], # mnist, cifar10  
    'num_layers': [4], # add in 2, 7
    'batch_size': [100], # 10, 100, 1000
    'shuffle_labels': [False], # loop
    'hidden_size': [128], # 128, 512
    'freeze': ['False'],
    'first_layer_lr_mult': [1],
    'save_all_freq': [20],
    'save_reduce': [False],
    'saves_per_iter': [2],
}
'''

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("proto", {"partition": partition, "time": "3-0"})
ks = sorted(params_to_vary.keys())
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    pairs = [key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks)]
    s.run('module load python; module load pytorch; python ../vision_fit/fit.py '
          + ''.join(pairs))
Ejemplo n.º 20
0
from slurmpy import Slurm

# paths and constant values
nproc_old = 441  # number of processors used in bin files
old_mesh_dir = "/scratch/05880/tg851791/asdf_sync/model_generating/tao_1d_ref_knl/DATABASES_MPI"  # the mesh files
old_model_dir = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/model/s362ani_good_min_tao_smooth"  # the model files
model_tags = "vpv,vph,vsv,vsh,eta,qmu,rho"  # values to generate
# output directory
output_file = "/scratch/05880/tg851791/work/generate_hybrid_v703/gll_work/ppm/notopo/3d"
# region as lon1/lat1/lon2/lat2/dep1/dep2 (eg: if lon1=30, lon2=20, get points like 30, 29, ...)
region = "74/0/175/62/0/800"
npts = "249/405/321"  # number of points, including the edge points.
# use 18*18 cores, can be set any way you like. (two directions, divide subregions)
nproc = "18/18"

# check if ../../specfem_gll.jl/src/program/get_ppm_model.jl is actually the path of get_ppm_model.jl
ppm_call = " ".join([
    "ibrun julia ../../specfem_gll.jl/src/program/get_ppm_model.jl",
    f"--nproc_old {nproc_old}",
    f"--old_mesh_dir {old_mesh_dir}",
    f"--old_model_dir {old_model_dir}",
    f"--model_tags {model_tags}",
    f"--output_file {output_file}",
    f"--region {region}",
    f"--npts {npts}",
    f"--nproc {nproc}",
])
# Bracket the run with `date` so the log shows start and end times.
command = ";".join(["date", ppm_call, "date"]) + ";"

# run 2h18min for my region, 60d*60d, 336*336NEX 21*21 proc. It's safe to set a longer time.
s = Slurm(
    "ppm",
    dict(partition="normal",
         nodes=5,
         ntasks=324,
         time="04:00:00",
         account="TG-EAR130011"),
)

s.run(command)
Ejemplo n.º 21
0
# Load the JSON configuration named on the command line and validate it.
with open(args.config) as f:
    config = json.load(f)
check_config(config, args)

# Read the sample sheet for the requested project.
samples = read_samplesheet(args.sample, args.project)

if args.mkref:
    jobids = []
    for sample in samples:
        fastq = samples[sample]
        run_bwa = align_and_convert(config, fastq, args, sample)
        bwa = Slurm("bwaAln-{}".format(sample), {
            "account": config["slurm"]["account"],
            "partition": "node",
            "time": config["slurm"]["time"]
        },
                    log_dir="{}/logs".format(args.out),
                    scripts_dir="{}/scripts".format(args.out))
        jobids.append(bwa.run(run_bwa))

    wcxmkref = Slurm("wcxmkref", {
        "account": config["slurm"]["account"],
        "partition": "node",
        "time": config["slurm"]["time"]
    },
                     log_dir="{}/logs".format(args.out),
                     scripts_dir="{}/scripts".format(args.out))
    wcxmkref.run(mkref(config, args), depends_on=jobids)

elif args.mkmodel:
Ejemplo n.º 22
0
import itertools
from slurmpy import Slurm



params_to_vary = dict(
    seed=range(60),
    hidden1=[1, 2, 3, 5, 10, 30, 50, 100],
    init=['default', 'data-driven'],
)

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("small_nn_run", {"partition": "low"})
ks = sorted(params_to_vary)
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    param_str = 'module load python; module load pytorch; python3 ../fit.py '
    param_str += ''.join(
        key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks))
    s.run(param_str)
Ejemplo n.º 23
0
def main():
    """Start worker jobs from a SLURM template and run the Spark driver.

    Command-line arguments: the batch-script template, a JSON parameter
    file, ``--yarn`` and ``--no_submit``. With ``--no_submit`` the template
    is run with ``bash`` in daemon threads; otherwise it is handed to
    ``sbatch``. Once the master log file exists, its first line is used as
    the Spark master URL for ``spark-submit``.
    """

    parser = argparse.ArgumentParser(description='Pilot-Agent scheduling for SLURM')
    parser.add_argument('template', type=str, help="SLURM batch script template")
    parser.add_argument('params', type=argparse.FileType('r'), help="SLURM batch script params (JSON)")
    parser.add_argument('-y', '--yarn', action='store_true', help="Yarn scheduler will be used")
    parser.add_argument('-D', '--no_submit', action='store_true', help="Create but do not submit sbatch scripts" )
    args = parser.parse_args()

    conf = None
    with args.params as f:
        conf = json.load(f)

    # With --yarn, create/truncate the Hadoop `slaves` file. HADOOP_HOME is
    # taken from the environment when a COMPUTE variable is set there,
    # otherwise from the JSON configuration.
    if args.yarn and 'COMPUTE' in os.environ:
        open(op.join(os.environ['HADOOP_HOME'], 'etc/hadoop/slaves'), 'w').close()
    elif args.yarn:
        open(op.join(conf["COMPUTE"]["HADOOP_HOME"], 'etc/hadoop/slaves'), 'w').close()

    submit_func = "bash" if args.no_submit else "sbatch"

    s = Slurm(conf["name"], conf["SLURM_CONF_GLOBAL"])

    program_start = datetime.now().strftime("%Y-%m-%d")

    # Run identifier: SHA-1 of the template argument string plus an MD5 of
    # 16 random bytes.
    rand_hash = '{0}-{1}'.format(hashlib.sha1(args.template.encode("utf-8")).hexdigest(), hashlib.md5(os.urandom(16)).hexdigest())
    # When submitting, the literal '${SLURM_JOB_ID}' is embedded in the file
    # name (presumably expanded later by the shell running the job - TODO confirm).
    job_id = rand_hash if args.no_submit else '${SLURM_JOB_ID}' 

    # Fill in defaults for the master benchmark/log paths; the lock path and
    # logdir are always overwritten.
    if not "COMPUTE" in conf:
        conf["COMPUTE"] = {}

    if not "mstr_bench" in conf["COMPUTE"]:
        conf["COMPUTE"]["mstr_bench"] = op.join(conf["logdir"], "master-{0}-benchmarks.{1}.out".format(program_start, job_id))

    if not "mstr_log" in conf["COMPUTE"]:
        conf["COMPUTE"]["mstr_log"] = op.join(conf["logdir"], "master-{0}-{1}.out".format(program_start, rand_hash))

    conf["COMPUTE"]["mstr_lock"] = op.join(conf["logdir"], "master-{0}-{1}.lock".format(program_start, rand_hash))

    conf["COMPUTE"]["logdir"] = conf["logdir"]   

    # if you want to run one master and worker locally, might as well submit to local
    rm_nnodes = 1 if args.no_submit else 0

    for i in range(conf["num_nodes"] - rm_nnodes):

        # SLURM batch submit workers
        if args.no_submit:
            # Local mode: run the template in a daemon thread.
            thread = threading.Thread(target=s.run, kwargs=dict(command=args.template, cmd_kwargs=conf["COMPUTE"], _cmd=submit_func))
            thread.daemon = True
            thread.start()
        else:
            s.run(args.template, name_addition=rand_hash, cmd_kwargs=conf["COMPUTE"], _cmd=submit_func)

    # Poll every 5 seconds until the master log file appears.
    while conf["num_nodes"] - rm_nnodes > 0 and not op.isfile(conf["COMPUTE"]["mstr_log"]):
        time.sleep(5)

    # NOTE(review): master_url is only bound when at least one worker was
    # started; the branches below that use it would raise NameError
    # otherwise (e.g. num_nodes == 0) - confirm whether that can occur.
    if conf["num_nodes"] - rm_nnodes > 0:
        master_url = ""

        # The first line of the master log is taken as the master URL.
        with open(conf["COMPUTE"]["mstr_log"], 'r') as f:
            master_url = f.readline().strip('\n')



    program = None
    driver_out = op.join(conf["logdir"], "driver-{0}-{1}.out".format(program_start, rand_hash))

    if not args.no_submit:
        # Start the allocation command and feed it shell lines through
        # stdin; its stdout/stderr go to the driver output file.
        fw = open(driver_out, "wb")
        fr = open(driver_out, "r")
        p = Popen(conf["DRIVER"]["slurm_alloc"], stdin = PIPE, stdout = fw, stderr = fw, bufsize = 1)
        for module in conf["DRIVER"]["modules"]:
            p.stdin.write("module load {}\n".format(module).encode('utf-8'))

        p.stdin.write("echo start $(date +%s.%N)\n".encode('utf-8'))
        # Doubled braces keep ${SLURM_...} literal through str.format.
        program = ("spark-submit --master {0} --executor-cores=${{SLURM_CPUS_PER_TASK}} "
                    "--executor-memory=${{SLURM_MEM_PER_NODE}}M  --driver-memory=${{SLURM_MEM_PER_NODE}}M {1}\n") \
                            .format(master_url, conf["DRIVER"]["program"])
        p.stdin.write(program.encode('utf-8'))

        # NOTE(review): `out` is never used after this read.
        out = fr.read()

        p.stdin.write("echo end $(date +%s.%N)\n".encode('utf-8'))
        # NOTE(review): this last command has no trailing newline, and stdin
        # is neither flushed nor closed and the process is not waited on in
        # this function - confirm the commands actually reach the child.
        p.stdin.write("echo 'SUCCEEDED' >> {}".format(conf["COMPUTE"]["mstr_log"]).encode('utf-8'))
        fw.close()
        fr.close()
    elif conf["num_nodes"] == 1:
        # Single local node: run the driver against a local[*] master.
        program = ("spark-submit --master local[*] {}\n").format(conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout = PIPE, stderr = PIPE)
        # communicate() returns (stdout, stderr); `stdin` here holds stdout.
        stdin, stderr = p.communicate()
        print(stdin, stderr)
    else:
        # Local run with several nodes: use the master URL read above.
        program = ("spark-submit --master {0} {1}\n").format(master_url, conf["DRIVER"]["program"])
        p = Popen(program.split(), stdout = PIPE, stderr = PIPE)
        # communicate() returns (stdout, stderr); `stdin` here holds stdout.
        stdin, stderr = p.communicate()
        print(stdin, stderr)
Ejemplo n.º 24
0
        cmd += [
            out_dir,
            str(cdn["iterations"]), "--benchmark", "--delay",
            str(cdn["delay"])
        ]
    else:
        out_dir = op.join(lustre, 'results', out_dir + cdn_ident)
        cmd += [out_dir, str(cdn["iterations"]), "--benchmark"]

    if cdn["filesystem"] != "mem":
        work_dir = op.join(filesystems[cdn["filesystem"]], 'work',
                           work_dir + cdn_ident)

        cmd += ["--cli", "--work_dir", work_dir]

    s = Slurm("incrementation", slurm_conf)

    if cdn["framework"] == "spark":
        cmd = " ".join(cmd)
        cmd = "\"{}\"".format(cmd)

        print("Submitting command: ", cmd)

        s.run("bash " + spark_template,
              cmd_kwargs={"spscript": cmd},
              _cmd=sys.argv[2])
    else:
        with open(legends[cdn["dataset"]]) as legend:
            images = legend.read().split()
            num_images = len(images)
            pn_images = num_images / num_nodes
Ejemplo n.º 25
0
    # sweep these
    'num_layers': [1, 2],  # 1, 2, 3
    'N': [200],
    'd': [2],  #, 8, 50, 128, 190, 200, 210, 400],
    'hidden_size': [64, 128],  # 12, 64
    'seed':
    range(0, 30),  # for understanding correlated vars, need this ~1000
    'opt': ['adam'],
    'lr': [5e-3],
    'num_iters': [int(5e5)],
    'use_bias': [False],
    'eps': [0.1],
}

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("interactions", {"partition": partition, "time": "1-0"})
ks = sorted(params_to_vary.keys())
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    pairs = [key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks)]
    s.run('module load python; module load pytorch; python3 ../poly_fit/fit.py '
          + ''.join(pairs))
Ejemplo n.º 26
0
import itertools
from slurmpy import Slurm

partition = 'gpu'

# run (change bottom line for max_corrs or margins!)
s = Slurm("cnn_feats", dict(partition=partition, time="2-0", gres="gpu:1"))
# models = ['vgg16', 'vgg19', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'densenet121', 'densenet201']# ['vgg11', 'vgg13', 'resnet18', 'densenet169'] # ['alexnet', 'vgg13'] #,

# just the basics
models = [
    'alexnet',
    'vgg11',
    'vgg13',
    'resnet18',
    'resnet34',
    'densenet121',
    'densenet169',
]

# all nets
# ['alexnet', 'vgg11', 'vgg13', 'vgg16', 'vgg19', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'densenet121', 'densenet169', 'densenet201']
#     'alexnet',
#           'vgg11', 'vgg13', 'vgg16', 'vgg19',
#           'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
#           'densenet169', 'densenet201', # densenet121
#           'inception_v3']

# iterate
for i, model in enumerate(models):
    #     param_str = 'module load python; python3 ../vision_analyze/max_corr_cnns.py '
    #     param_str = 'module load python; python3 ../vision_analyze/cnns/save_imagenet_preds.py '
Ejemplo n.º 27
0
    def wrapper(*args, **kwargs):
        # Without the grandparent's `sbatch` option this simply invokes
        # `func`. Otherwise the current command line is rebuilt and handed
        # to slurmpy: submitted with `sbatch` when `run` is set, printed
        # with `cat` when it is not.
        ctx = click.get_current_context()
        ctx.grand_parent = ctx.parent.parent
        top_params = ctx.grand_parent.params
        if not top_params["sbatch"]:
            return ctx.invoke(func, *args, **kwargs)

        run = top_params["run"]
        if run:
            click.secho("SBATCH MODE! Submitting to SLURM queue.", fg="green")

        directory = ctx.parent.params.get("directory")
        if not directory:
            raise EmmetCliError(
                f"{ctx.parent.command_path} needs --directory option!")

        # Scripts and logs are kept in a hidden folder inside `directory`.
        track_dir = os.path.join(directory, ".emmet")
        if run and not os.path.exists(track_dir):
            os.mkdir(track_dir)
            logger.debug(f"{track_dir} created")

        bb = top_params["bb"]
        yes = top_params["yes"]
        if bb:
            if not yes:
                click.confirm("Did you run `module unload esslurm`?",
                              abort=True)
            # Stage the directory in under its last path component.
            subdir = directory.rsplit(os.sep, 1)[1]
            command = "\n".join([
                "#DW jobdw capacity=10TB access_mode=striped type=scratch",
                f"#DW stage_in source={directory} "
                f"destination=$DW_JOB_STRIPED/{subdir} type=directory",
                "srun hostname",
                "",
            ])
            slurm_kwargs = {
                "qos": "premium",
                "nodes": 1,
                "tasks-per-node": 1,
                "constraint": "haswell",
                "time": "48:00:00",
            }
        else:
            if not yes:
                click.confirm("Did you run `module load esslurm`?", abort=True)
            command = ""
            slurm_kwargs = {
                "qos": "xfer",
                "time": "48:00:00",
                "licenses": "SCRATCH",
                "mem": "30GB",
            }

        job_name = ctx.command_path.replace(" ", "-")
        s = Slurm(
            job_name,
            slurm_kwargs=slurm_kwargs,
            date_in_name=False,
            scripts_dir=track_dir,
            log_dir=track_dir,
            bash_strict=False,
        )

        command += reconstruct_command(sbatch=True)
        submit_cmd = "sbatch" if run else "cat"
        # Capture what slurmpy writes to stderr so it can be logged below.
        captured = io.StringIO()
        with contextlib.redirect_stderr(captured):
            s.run(command, _cmd=submit_cmd, tries=1)
        # Drop the first two and the last character, then unescape the rest.
        ret = captured.getvalue()[2:-1]
        logger.info("\n" + ret.encode("utf-8").decode("unicode_escape"))
        # TODO add jobid to SUBMITTED.value
        if run:
            return ReturnCodes.SUBMITTED
        return ReturnCodes.SUCCESS
Ejemplo n.º 28
0
def cluster(tool, invocation, clowdrloc, dataloc, cluster, **kwargs):
    """cluster
    Launches a pipeline on a cluster through the Clowdr wrappers: one
    scheduler job is created per task, each running ``clowdr run ... --local``.

    Parameters
    ----------
    tool : str
        Path to a boutiques descriptor for the tool to be run
    invocation : str
        Path to a boutiques invocation for the tool and parameters to be run
    clowdrloc : str
        Path for storing Clowdr intermediate files and outputs
    dataloc : str
        Path for accessing input data. If local, provide the hostname and
        optionally a path. If on S3, provide an S3 path.
    cluster : str
        Scheduler on the cluster being used. Currently, the only supported mode
        is slurm.
    **kwargs : dict
        Arbitrary keyword arguments. Currently supported arguments:
        - account : str
            Account for the cluster scheduler
        - jobname : str
            Base-name for the jobs as they will appear in the scheduler
            (defaults to "clowdrtask")
        - slurm_args : str
            Comma-separated ``key:value`` pairs passed to the scheduler
        - simg : str
            Path resolved with ``utils.truepath`` before use
        - workdir : str
            Added to each job command as ``-w <workdir>``
        - volumes : list
            Each entry is added to each job command as ``-v <volume>``
        - verbose : bool
            Toggle verbose output printing
        - dev : bool
            Toggle dev mode (only runs first execution in the specified set)

        Additionally, transfers all keyword arguments accepted by both of
        "controller.metadata.consolidateTask" and "task.processTask"

    Returns
    -------
    str
        The directory containing the task files, which is also made the
        current working directory
    """
    # TODO: scrub inputs
    tool = utils.truepath(tool)
    if kwargs.get("simg"):
        kwargs["simg"] = utils.truepath(kwargs["simg"])

    from slurmpy import Slurm

    verbose = kwargs.get("verbose")

    if verbose:
        print("Consolidating metadata...")
    [tasks, invocs] = metadata.consolidateTask(tool, invocation, clowdrloc,
                                               dataloc, **kwargs)
    if kwargs.get("dev"):
        tasks = [tasks[0]]  # Just launch the first task in dev

    taskdir = op.dirname(utils.truepath(tasks[0]))
    try:
        os.mkdir(taskdir)
    except FileExistsError:
        pass
    os.chdir(taskdir)

    with open(tool) as fhandle:
        container = json.load(fhandle).get("container-image")
    if container:
        if verbose:
            print("Getting container...")
        outp = utils.getContainer(taskdir, container, **kwargs)
        if verbose:
            print(outp)

    jobname = kwargs.get("jobname") or "clowdrtask"

    # "key:value" pairs; everything after the first colon is the value, so
    # values may themselves contain colons.
    slurm_args = {}
    if kwargs.get("slurm_args"):
        for opt in kwargs.get("slurm_args").split(","):
            key, _, value = opt.partition(":")
            slurm_args[key] = value
    job = Slurm(jobname, slurm_args)

    # Options shared by every task. They are appended AFTER the per-task
    # formatting so that braces in user-supplied paths are not treated as
    # format fields.
    extra_opts = ""
    if kwargs.get("workdir"):
        extra_opts += " -w {}".format(kwargs["workdir"])
    if kwargs.get("volumes"):
        extra_opts += " ".join(
            " -v {}".format(vol) for vol in kwargs.get("volumes"))

    for task in tasks:
        job.run("clowdr run {} -c {} --local".format(task, taskdir) +
                extra_opts)

    if verbose:
        print(taskdir)
    return taskdir
Ejemplo n.º 29
0
import itertools
from slurmpy import Slurm

partition = 'gpu_yugroup'

# sweep lambda_reg
params_to_vary = {
    '--reg1': [0, 1e-1, 5e-1, 1e0, 1e1, 1e2, 1e3],
    '--reg2': [0],
}

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("decode", dict(partition=partition, time="3-0", gres="gpu:1"))
ks = sorted(params_to_vary)
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    param_str = 'module load python; module load pytorch; python ../train.py '
    param_str += ''.join(
        key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks))
    s.run(param_str)
Ejemplo n.º 30
0
import itertools
from slurmpy import Slurm

partition = 'high'

# sweep small dsets
params_to_vary = {
    'alpha': [1, 10],  # [0.001, 0.05, 1, 10],
    'num_bases': [25, 100, 400],
    'class_num': [None],  # [0, 1]
    'batch_size': [100],
}

# Build the cartesian product of all parameter values (keys in sorted order).
s = Slurm("sparse_coding", {"partition": partition, "time": "4-0"})
ks = sorted(params_to_vary)
vals = [params_to_vary[k] for k in ks]
param_combinations = list(itertools.product(*vals))  # list of tuples
print(param_combinations)
# for param_delete in params_to_delete:
#     param_combinations.remove(param_delete)

# One job per combination: "<key> <value> " pairs appended to the command.
for combo in param_combinations:
    pairs = [key + ' ' + str(combo[j]) + ' ' for j, key in enumerate(ks)]
    s.run('module load python; python3 run_sparse.py ' + ''.join(pairs))