# --- Cluster-training configuration (test_tube SlurmCluster) ---
cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
    # test_tube_exp_name=hyperparams.test_tube_exp_name
)

# Ask SLURM to email job results, if the HPC system supports it.
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# Environment modules loaded before the job runs.
cluster.load_modules(['python-3', 'anaconda3'])

# Shell commands emitted into the non-SLURM part of the submit script.
cluster.add_command('source activate transformers')

# Custom SLURM directives can be injected; they render as:
#   #comment
#   #SBATCH --cmd=value
# ############
# cluster.add_slurm_cmd(
#     cmd='cpus-per-task', value='1', comment='CPUS per task.')

# Compute resources requested PER hyperparameter combination.
cluster.per_experiment_nb_gpus = 4
cluster.per_experiment_nb_nodes = 2
cluster.gpu_type = '1080ti'
# Weight decay is a tunable hyperparameter, sampled log-uniformly.
parser.opt_range(
    '--wd',
    default=1e-5,
    type=float,
    tunable=True,
    low=1e-7,
    high=1e-4,
    nb_samples=100,
    log_base=10,
)
hyperparams = parser.parse_args()

# --- Cluster-training configuration (test_tube SlurmCluster) ---
cluster = SlurmCluster(
    hyperparam_optimizer=hyperparams,
    log_path=hyperparams.log_path,
    python_cmd='python3',
)

# Email notification is disabled on this system.
# cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# Environment modules loaded before the job runs.
cluster.load_modules([
    'daint-gpu',
])

# Shell commands emitted into the non-SLURM part of the submit script:
# activate the conda env, then export NCCL/OpenMP settings for the
# Cray Aries interconnect (ipogif0) and print diagnostics.
for command in (
    '. /apps/daint/UES/6.0.UP04/sandboxes/sarafael/miniconda-ss2020/bin/activate',
    'export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK',
    'export NCCL_DEBUG=INFO',
    'export PYTHONFAULTHANDLER=1',
    'export NCCL_IB_HCA=ipogif0',
    'export NCCL_IB_CUDA_SUPPORT=1',
    'srun nproc',
    'srun which python',
):
    cluster.add_command(command)

# Custom SLURM directives can be injected; they render as: