def optimize_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.tt_save_path,
                           test_tube_exp_name=hyperparams.tt_name)

    # email for cluster comms
    cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.job_time = '48:00:00'
    cluster.gpu_type = '1080ti'
    cluster.memory_mb_per_node = 48000

    # any modules for code to run in env
    cluster.add_command('source activate pytorch_lightning')

    # name of exp
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:3]

    # run hopt
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
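For context, the `hyperparams` object these functions receive comes from test-tube's `HyperOptArgumentParser`. Below is a minimal sketch of a driver script; the flag names (`--tt_name`, `--tt_save_path`, `--nb_hopt_trials`, `--per_experiment_nb_gpus`) are inferred from the attribute accesses above, and the default values are purely illustrative:

from test_tube import HyperOptArgumentParser, SlurmCluster

parser = HyperOptArgumentParser(strategy='random_search')
# flags inferred from the attribute accesses in optimize_on_cluster
parser.add_argument('--tt_name', default='my_exp_v1')
parser.add_argument('--tt_save_path', default='/scratch/tt_logs')
parser.add_argument('--nb_hopt_trials', default=20, type=int)
parser.add_argument('--per_experiment_nb_gpus', default=1, type=int)
# a tunable hyperparameter; each trial samples one value
parser.opt_list('--learning_rate', default=0.001, type=float,
                options=[0.0001, 0.0005, 0.001], tunable=True)
hyperparams = parser.parse_args()

optimize_on_cluster(hyperparams)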
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster comms
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='gpu partition')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name
    )
def run_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.logs_save_path)

    # email results if your hpc supports it
    cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # pick the gpu resources
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    # cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  # 180000

    # come up with a short exp name
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:4]

    # optimize across all gpus
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster comms
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.job_time = '2:00:00'
    cluster.gpu_type = 'volta'
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # set DDP master port
    cluster.add_command(f'export MASTER_PORT={PORT}')

    # OPTIONAL for debugging
    # without these flags, errors in your code will
    # appear to be nccl errors
    cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command('export PYTHONFAULTHANDLER=1')

    # depending on your cluster config, you probably want
    # to limit the wired connection device
    # cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo')

    # depending on your cluster, you might need to load
    # the latest NCCL version
    # cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0'])

    # run only on 32GB voltas
    cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='gpu partition')

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.num_hyperparam_trials,
        job_name=hyperparams.experiment_name
    )
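The `main` handed to `optimize_parallel_cluster_gpu` is the per-trial entry point that test-tube invokes inside each submitted SLURM job. The exact callback signature has varied across test-tube versions, so this sketch takes a defensive `*args`; the body is a hypothetical placeholder, not the original training code:

import os

def main(hparams, *args):
    # runs inside one SLURM job, with one sampled
    # hyperparameter configuration in `hparams`
    print(f"trial starting, MASTER_PORT={os.environ.get('MASTER_PORT')}")
    # build the model and trainer from `hparams` and fit here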
from test_tube import HyperOptArgumentParser, SlurmCluster

# Hypothetical parser setup and flag name; the original
# snippet starts mid-call.
parser = HyperOptArgumentParser(strategy='grid_search')
parser.opt_list('--batch_size',
                default=12, options=[20, 12, 30, 45], tunable=True)
hyperparams = parser.parse_args()

# Enable cluster training.
cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
    # test_tube_exp_name=hyperparams.test_tube_exp_name
)

# Email results if your hpc supports it.
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# SLURM Module to load.
cluster.load_modules(['python-3', 'anaconda3'])

# Add commands to the non-SLURM portion.
cluster.add_command('source activate transformers')

# Add custom SLURM commands which show up as:
# #comment
# #SBATCH --cmd=value
# ############
# cluster.add_slurm_cmd(
#     cmd='cpus-per-task', value='1', comment='CPUS per task.')
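The example above stops before submission. Assuming a `train` function like the `main` sketched earlier and an `--nb_hopt_trials` flag on the parser (neither appears in this snippet), the final step would look roughly like:

# writes one sbatch script per trial under log_path and submits it
cluster.optimize_parallel_cluster_gpu(
    train,
    nb_trials=hyperparams.nb_hopt_trials,  # assumed flag, as in the other examples
    job_name='transformers_hopt',          # hypothetical job name
)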