def register_barrier():
    """Register a 'barrier' job so we don't have multiple jobs running simultaneously."""
    # Barrier runs its own small script; it reuses the already-uploaded code package.
    runtime = {
        "clone": {"code_package": code_package},
        "script": "barrier.py",
    }
    register_job(
        user="******",
        project="sgd",
        experiment=experiment,
        job="barrier",
        priority=priority,
        n_workers=16,
        config_overrides={},
        runtime_environment=runtime,
        annotations={"description": description},
    )
def schedule(name, config, skip_existing=False):
    """Register one job and print the sbatch command that launches it.

    `config` is overlaid on the module-level `base_config`; `skip_existing`
    avoids re-registering a job already present in the database.
    """
    # Guard clause: bail out early if this job was registered before.
    if skip_existing:
        query = {"project": project, "job": name, "experiment": experiment}
        if mongo.job.count_documents(query) > 0:
            return

    config = {**base_config, **config}
    n_workers = config["n_workers"]

    job_id = register_job(
        user="******",
        project=project,
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=10,
        config_overrides=config,
        runtime_environment={"clone": {"code_package": code_package}, "script": script},
        annotations={"description": description},
    )
    # One GPU and 8 CPUs per worker; jobrun picks the job up via its id.
    print(
        f'sbatch --ntasks {n_workers} --job-name="{name}" --gpus-per-task=1 --cpus-per-task=8 --wrap="srun jobrun {job_id} --mpi"'
    )
def schedule(name, config, skip_existing=True):
    """Register one job (learning-rate sweep variant) and record its id in `ids`."""
    # Merge before the existence check: the query needs the effective learning rate.
    config = {**base_config, **config}

    if skip_existing:
        query = {
            "project": project,
            "job": name,
            "experiment": experiment,
            "config.learning_rate": config["learning_rate"],
        }
        # Same name + learning rate already registered -> nothing to do.
        if mongo.job.count_documents(query) > 0:
            return

    # NOTE(review): n_workers comes from the enclosing scope here, not from
    # config (unlike the other schedule() variant) — confirm that is intended.
    job_id = register_job(
        user="******",
        project=project,
        experiment=experiment,
        job=name,
        n_workers=n_workers,
        priority=10,
        config_overrides=config,
        runtime_environment={"clone": {"code_package": code_package}, "script": script},
        annotations={"description": description},
    )
    ids.append(job_id)
# Full override set for one training run: base SGD settings plus the
# reducer-specific options and the compression ratio under test.
overrides = {
    "seed": seed,
    "distributed_backend": "nccl",
    "optimizer_scale_lr_with_factor": n_workers,
    "num_epochs": 300,
    "log_verbosity": 1,
    **shared.sgd_config(learning_rate, momentum=0.9, weight_decay=0.0001),
    **shared.optimizer_config(reducer),
    "optimizer_reducer_compression": 1 / compression,
}
job_id = register_job(
    user="******",
    project="sgd",
    experiment=experiment,
    job=name,
    n_workers=n_workers,
    priority=10,
    config_overrides=overrides,
    runtime_environment={"clone": {"code_package": code_package}, "script": "train.py"},
    annotations={"description": description},
)
name = f"{reducer}_{n_workers:02d}workers_lr{learning_rate}" if mongo.job.count_documents({"job": name, "experiment": experiment}) > 0: # We have this one already continue job_id = register_job( user="******", project="sgd", experiment=experiment, job=name, n_workers=n_workers, priority=20, config_overrides={ "seed": seed, "distributed_backend": "nccl", "optimizer_scale_lr_with_factor": n_workers, **shared.language_modeling_base(), **shared.sgd_config(learning_rate, momentum=0.0, weight_decay=0.0), **shared.optimizer_config(reducer), }, runtime_environment={ "clone": { "code_package": code_package }, "script": "train.py" }, annotations={"description": description}, ) print("{} - {}".format(job_id, name)) registered_ids.append(job_id) # kubernetes_schedule_job_queue(
# Seeds are offset by 10000 for this run; nccl jobs get a +100 priority
# boost over gloo so they are scheduled first.
job_priority = seed + (100 if backend == "nccl" else 0)
overrides = {
    "seed": 10000 + seed,
    "optimizer_scale_lr_with_factor": n_workers,
    "distributed_backend": backend,
    "log_verbosity": log_level,
    "num_epochs": 10,
    **shared.sgd_config(0.1, momentum=0.9, weight_decay=0.0001),
    **shared.optimizer_config(reducer),
}
job_id = register_job(
    user="******",
    project="sgd",
    experiment=experiment,
    job=name,
    priority=job_priority,
    n_workers=n_workers,
    config_overrides=overrides,
    runtime_environment={"clone": {"code_package": code_package}, "script": "train.py"},
    annotations={"description": description},
)
sleep(0.1)
# Timing sweep: measure communication timings for each backend / worker count.
for backend in ["nccl", "gloo"]:
    for n_workers in [2, 4, 8, 16]:
        name = f"time_{n_workers}workers_{backend}"
        # Skip combinations that were registered by an earlier run.
        if mongo.job.count_documents({"job": name, "experiment": experiment}) > 0:
            continue
        timing_config = {
            "distributed_backend": backend,
            "repetitions": 20,
            "device": "cuda",
            "n_workers": n_workers,
        }
        job_id = register_job(
            user="******",
            project="sgd",
            experiment=experiment,
            job=name,
            n_workers=n_workers,
            priority=priority,
            config_overrides=timing_config,
            runtime_environment={"clone": {"code_package": code_package}, "script": "timings.py"},
            annotations={"description": description},
        )
        print("{} - {}".format(job_id, name))
        registered_ids.append(job_id)
        # Brief pause between registrations.
        sleep(0.1)
# Barrier so these timing jobs don't run simultaneously with later jobs.
register_barrier()
sleep(0.1)
code_package, files_uploaded = upload_code_package(".", excludes=excluded_files)
print("Uploaded {} files.".format(len(files_uploaded)))

# One transfer-learning job per pretrained noise level.
for noise_level in ["0.00", "0.25", "0.50", "1.00"]:
    checkpoint = f"/raw/vogels/locuslab-smoothing-pretrained-models/imagenet/resnet50/noise_{noise_level}/checkpoint.pth.tar"
    config = {
        "pretrained_noise_level": noise_level,
        "model_path": checkpoint,
    }
    job_id = register_job(
        user="******",
        project="adversarial-transfer-learning",
        experiment="does-random-noise-also-help",
        job=f"noise_{noise_level}",
        priority=10,
        config_overrides=config,
        runtime_environment={"clone": {"code_package": code_package}, "script": "main.py"},
        annotations={
            "description": "Using pretrained ImageNet models from https://github.com/locuslab/smoothing, we want to see if Gaussian input perturbations have the same effect as adversarial ones."
        },
    )
    print(f"jobrun {job_id}")