# Register one SGD language-modeling job (no momentum) and record its id.
# NOTE(review): this chunk was whitespace-mangled onto a single (syntactically
# invalid) line; formatting reconstructed — tokens, strings, and the
# override order of the **-spreads are preserved exactly.
learning_rate = 1.25
momentum = 0
job_id = register_job(
    user="******",
    project="sgd",
    experiment=experiment,
    job=name,
    n_workers=n_workers,
    priority=20,
    config_overrides={
        "seed": seed,
        "distributed_backend": "nccl",
        # presumably tells the trainer to scale LR linearly with worker count
        # — confirm against register_job's consumer.
        "optimizer_scale_lr_with_factor": n_workers,
        "log_verbosity": 1,
        # Later entries override earlier ones, so the shared configs below
        # take precedence over the literals above where keys collide.
        **shared.language_modeling_base(),
        **shared.sgd_config(learning_rate, momentum=momentum, weight_decay=0.0),
        **shared.optimizer_config(reducer),
    },
    runtime_environment={"clone": {"code_package": code_package}, "script": "train.py"},
    annotations={"description": description},
)
print("{} - {}".format(job_id, name))
registered_ids.append(job_id)
# Commented-out scheduling call, kept as found (truncated at the end of this chunk):
# kubernetes_schedule_job_queue(
#     registered_ids,
#     "ic-registry.epfl.ch/mlo/vogels_experiment",
#     volumes=["pv-mlodata1"],
#     gpus=gpus,
#     parallelism=4,
# Register one 300-epoch SGD job with a compressed-gradient reducer.
# NOTE(review): the original chunk began mid-call — the `job_id = register_job(`
# opener below is reconstructed from the identical pattern used elsewhere in
# this file; confirm against the unmangled original.
job_id = register_job(
    job=name,
    n_workers=n_workers,
    priority=10,
    config_overrides={
        "seed": seed,
        "distributed_backend": "nccl",
        "optimizer_scale_lr_with_factor": n_workers,
        "num_epochs": 300,
        "log_verbosity": 1,
        **shared.sgd_config(learning_rate, momentum=0.9, weight_decay=0.0001),
        **shared.optimizer_config(reducer),
        # Placed after the spreads on purpose: this key overrides any value
        # the shared optimizer config may have set for it.
        "optimizer_reducer_compression": 1 / compression,
    },
    runtime_environment={
        "clone": {
            "code_package": code_package
        },
        "script": "train.py"
    },
    annotations={"description": description},
)
print("{} - {}".format(job_id, name))
registered_ids.append(job_id)
# Register one short (10-epoch) job comparing distributed backends; NCCL runs
# get a +100 priority boost and seeds are offset by 10000 to keep them
# distinct from the main runs.
# NOTE(review): the original chunk began mid-call — the `job_id = register_job(`
# opener below is reconstructed from the identical pattern used elsewhere in
# this file; confirm against the unmangled original.
job_id = register_job(
    job=name,
    priority=seed + (100 if backend == "nccl" else 0),
    n_workers=n_workers,
    config_overrides={
        "seed": 10000 + seed,
        "optimizer_scale_lr_with_factor": n_workers,
        "distributed_backend": backend,
        "log_verbosity": log_level,
        "num_epochs": 10,
        **shared.sgd_config(0.1, momentum=0.9, weight_decay=0.0001),
        **shared.optimizer_config(reducer),
    },
    runtime_environment={
        "clone": {
            "code_package": code_package
        },
        "script": "train.py",
    },
    annotations={"description": description},
)
print("{} - {}".format(job_id, name))
registered_ids.append(job_id)
# Commented-out scheduling call, kept as found (truncated at the end of this chunk):
# kubernetes_schedule_job_queue(