Code example #1
File: submit_jobs.py  Project: nkasmanoff/dm2gal
def run_on_cluster(hyperparams):
    # enable cluster training
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path=hyperparams.logs_save_path)

    # email results if your hpc supports it
    cluster.notify_job_status(email='*****@*****.**',
                              on_done=True,
                              on_fail=True)
    # any modules for code to run in env
    cluster.add_command(f'source activate {hyperparams.conda_env}')
    # pick the gpu resources
    cluster.per_experiment_nb_gpus = hyperparams.gpus
    cluster.per_experiment_nb_cpus = 1
    cluster.per_experiment_nb_nodes = 1
    #cluster.gpu_type = 'k80'
    cluster.job_time = '48:00:00'
    cluster.minutes_to_checkpoint_before_walltime = 5
    cluster.memory_mb_per_node = 250000  #180000
    # come up with a short exp name
    job_display_name = hyperparams.tt_name.split('_')[0]
    job_display_name = job_display_name[0:4]
    # optimize across all gpus
    print('submitting jobs...')
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.nb_hopt_trials,
                                          job_name=job_display_name)
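For context, the hyperparams namespace read by run_on_cluster above typically comes from test_tube's HyperOptArgumentParser. The following driver is a minimal, hypothetical sketch: the argument names simply mirror the attributes the function accesses (logs_save_path, conda_env, gpus, nb_hopt_trials, tt_name), while the defaults and the swept learning-rate values are placeholders rather than values from the original project.

from test_tube import HyperOptArgumentParser

# grid_search enumerates every combination of the opt_list options below
parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--logs_save_path', default='/scratch/slurm_logs')
parser.add_argument('--conda_env', default='dm2gal')
parser.add_argument('--gpus', default=1, type=int)
parser.add_argument('--nb_hopt_trials', default=4, type=int)
parser.add_argument('--tt_name', default='density_experiment')
# hyperparameters to sweep; each SLURM job receives one combination
parser.opt_list('--learning_rate', default=1e-3, type=float,
                options=[1e-3, 1e-4], tunable=True)

hyperparams = parser.parse_args()
run_on_cluster(hyperparams)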
Code example #2
def optimize_on_cluster(hyperparams):
    # enable cluster training
    # log all scripts to the test tube folder
    cluster = SlurmCluster(
        hyperparam_optimizer=hyperparams,
        log_path=hyperparams.slurm_log_path,
    )

    # email for cluster comms
    # cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)

    # configure cluster
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.per_experiment_nb_nodes = hyperparams.nb_gpu_nodes
    cluster.per_experiment_nb_cpus = hyperparams.per_experiment_nb_cpus
    cluster.job_time = hyperparams.job_time
    cluster.gpu_type = hyperparams.gpu_type
    cluster.memory_mb_per_node = 0

    # any modules for code to run in env
    cluster.add_command("source activate dialog")
    cluster.add_command(
        "export PYTHONPATH=$PYTHONPATH:/private/home/koustuvs/mlp/latentDialogAnalysis"
    )

    # run only on 32GB voltas
    # cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',
    #                     comment='use 32gb gpus')
    cluster.add_slurm_cmd(cmd="partition",
                          value=hyperparams.gpu_partition,
                          comment="use 32gb gpus")

    # run hopt
    # creates and submits jobs to slurm
    cluster.optimize_parallel_cluster_gpu(
        main,
        nb_trials=hyperparams.nb_hopt_trials,
        job_name=hyperparams.id + "_grid_search",
        job_display_name=hyperparams.id,
    )
Code example #3
# enable cluster training
cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                       log_path=hyperparams.log_path,
                       python_cmd='python3',
                       test_tube_exp_name=hyperparams.test_tube_exp_name)

# email results if your hpc supports it
cluster.notify_job_status(email='*****@*****.**', on_done=True, on_fail=True)

# any modules for code to run in env
cluster.load_modules(['python-3', 'anaconda3'])
# add commands to the non slurm portion
cluster.add_command('source activate myCondaEnv')

# can also add custom slurm commands which show up as:
# #comment
# #SBATCH --cmd=value
# ############
# cluster.add_slurm_cmd(cmd='cpus-per-task', value='1', comment='nb cpus per task')

# set job compute details (this will apply PER set of hyperparameters)
cluster.per_experiment_nb_cpus = 20
cluster.per_experiment_nb_nodes = 10

# each job (24 in total here) will use 200 cpus for each set of hyperparams
# if job_display_name is set, it's what will display in the slurm queue
cluster.optimize_parallel_cluster_cpu(train,
                                      nb_trials=24,
                                      job_name='first_tt_job',
                                      job_display_name='short_name')
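For reference, the train callable handed to optimize_parallel_cluster_cpu is invoked once per generated trial with that trial's resolved hyperparameters (and, judging from the two-argument lambda in code example #4, a cluster handle as a second argument). A minimal, purely illustrative sketch of such a callable, with a placeholder body:

def train(trial_hparams, cluster_manager=None):
    # trial_hparams: namespace holding this single hyperparameter combination
    # cluster_manager: the SlurmCluster handle for this job (may go unused)
    print(f'starting trial with hyperparameters: {trial_hparams}')
    # ... actual model construction, training and evaluation would go here ...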
Code example #4
def run_cluster(parser, fn_main, lt_system):
    params = parser.parse_args()

    if params.system_mode == "3d" and "3d" not in params.model_name:
        params.model_name += "_3d"

    if ':' not in params.time:
        params.time = f"{int(params.time):02d}:00:00"

    arch = platform.uname().processor

    loaded_module = ''
    partition = params.partition
    # if partition is None:
    if arch == 'x86_64':
        partition = 'npl'
    elif arch == 'ppc64le':
        partition = 'dcs,rpi'

    if partition == 'npl':
        loaded_module = "module load gcc cuda openmpi"
    else:
        loaded_module = "module load spectrum-mpi"

    log_path = os.path.join(os.environ['HOME'], params.slurm_log_root)

    cluster = SlurmCluster(hyperparam_optimizer=params,
                           log_path=log_path,
                           python_cmd="python")

    # cluster.notify_job_status(email='',
    #                           on_fail=True,
    #                           on_done=False)
    # configure cluster
    cluster.per_experiment_nb_gpus = params.n_gpus
    cluster.per_experiment_nb_nodes = params.num_nodes
    cluster.per_experiment_nb_cpus = 0  # disable this option
    cluster.job_time = params.time
    cluster.minutes_to_checkpoint_before_walltime = 2  # checkpoint 2 min before walltime
    cluster.memory_mb_per_node = int(params.n_gpus) * int(
        params.cpus_per_task) * int(params.mem_per_cpu)

    if params.partition is not None:
        cluster.add_slurm_cmd('partition',
                              value=params.partition,
                              comment='cluster partition name')
    cluster.add_slurm_cmd('ntasks-per-node',
                          value=params.n_gpus,
                          comment='#task per node')
    cluster.add_slurm_cmd('cpus-per-task',
                          value=params.cpus_per_task,
                          comment='#cpu per task/gpu')
    cluster.add_slurm_cmd('mem-per-cpu',
                          value=params.mem_per_cpu,
                          comment="memory per cpu")

    # cluster.memory_mb_per_node = params.memory  # disable this option

    cluster.add_command('export PYTHONFAULTHANDLER=1')
    # cluster.add_command('export NCCL_DEBUG=INFO')
    cluster.add_command(loaded_module)

    # Master address for multi-node training
    cluster.add_command(
        "export SLURM_JOB_NODELIST=$(scontrol show hostnames $SLURM_JOB_NODELIST | tr '\\n' ' ')"
    )
    cluster.add_command("export SLURM_NODELIST=$SLURM_JOB_NODELIST")
    cluster.add_command(
        "slurm_nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST)")
    cluster.add_command(
        "export MASTER_ADDRESS=$(echo $slurm_nodes | cut -d' ' -f1)")

    if params.job_name is None:
        job_name = params.model_name
    else:
        job_name = params.job_name

    # Each hyperparameter combination will use params.n_gpus gpus.
    cluster.optimize_parallel_cluster_gpu(
        # Function to execute
        lambda par, _optimizer: fn_main(par, lt_system, _optimizer),
        # Number of hyperparameter combinations to search:
        nb_trials=params.nb_trials,
        enable_auto_resubmit=params.auto_resubmit,
        # This is what will display in the slurm queue:
        job_name=job_name)
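To make the call signature concrete, here is a hypothetical invocation of run_cluster. The main entry point and the LitSystem class are placeholders, and the parser arguments only mirror the attributes run_cluster reads (system_mode, model_name, time, partition, slurm_log_root, n_gpus, num_nodes, cpus_per_task, mem_per_cpu, job_name, nb_trials, auto_resubmit); the defaults are illustrative only.

from test_tube import HyperOptArgumentParser

class LitSystem:
    """Placeholder for the Lightning system forwarded as lt_system."""

def main(hparams, lt_system, cluster):
    # placeholder training entry point matching fn_main's expected signature
    print(f'training {lt_system.__name__} with n_gpus={hparams.n_gpus}')

parser = HyperOptArgumentParser(strategy='grid_search')
parser.add_argument('--system_mode', default='2d')
parser.add_argument('--model_name', default='unet')
parser.add_argument('--time', default='24')  # hours; expanded to HH:00:00 above
parser.add_argument('--partition', default=None)
parser.add_argument('--slurm_log_root', default='slurm_logs')
parser.add_argument('--n_gpus', default=4, type=int)
parser.add_argument('--num_nodes', default=2, type=int)
parser.add_argument('--cpus_per_task', default=8, type=int)
parser.add_argument('--mem_per_cpu', default=4000, type=int)
parser.add_argument('--job_name', default=None)
parser.add_argument('--nb_trials', default=1, type=int)
parser.add_argument('--auto_resubmit', action='store_true')

run_cluster(parser, main, LitSystem)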