def make_batch_script(trainer_params, model_params, script_params):
    """Assemble a batch script that trains the model with LBANN.

    The trainer, model, data reader and optimizer are exported to
    `experiment.prototext` inside `script_params['work_dir']`, and the
    returned script launches the LBANN executable on that file. The
    script is only constructed here; it is neither written nor run.

    Args:
        trainer_params: Object whose `mini_batch_size` attribute is
            forwarded to `lbann.Trainer`.
        model_params (dict): Keyword arguments for `make_model`.
        script_params (dict): Keyword arguments for
            `lbann.contrib.launcher.make_batch_script`. Must contain
            the key `'work_dir'`.

    Returns:
        The batch script object produced by
        `lbann.contrib.launcher.make_batch_script`.
    """

    # LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    lr_schedule = (
        ([1], 2),
        ([2, 4, 8, 12], 0.75),
    )
    for epochs, factor in lr_schedule:
        model.callbacks.append(
            lbann.CallbackDropFixedLearningRate(
                drop_epoch=epochs,
                amt=factor,
            )
        )

    work_dir = script_params['work_dir']

    # Checkpoint after every epoch
    checkpoint_callback = lbann.CallbackCheckpoint(
        checkpoint_dir=os.path.join(work_dir, 'checkpoint'),
        checkpoint_epochs=1,
    )
    trainer.callbacks.append(checkpoint_callback)

    # Dump weights after every epoch
    dump_callback = lbann.CallbackDumpWeights(
        basename=os.path.join(work_dir, 'weights'),
        epoch_interval=1,
    )
    model.callbacks.append(dump_callback)

    # Export the experiment description as a Protobuf text file
    protobuf_file = os.path.join(work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Batch script: timestamp, launch LBANN, timestamp, propagate status
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    training_command = [
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ]
    script.add_parallel_command(training_command)
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
def make_batch_script(trainer_params, model_params, script_params):
    """Run the model through the LBANN inference executable.

    Despite its name, this variant does not return a batch script: it
    builds the LBANN objects, launches them immediately through
    `lbann.contrib.launcher.run` using the `lbann_inf` executable found
    next to the regular LBANN executable, and prints the resulting
    status. Nothing is returned.

    Args:
        trainer_params (dict): Must contain `'mini_batch_size'`.
        model_params (dict): Keyword arguments for `make_model`.
        script_params (dict): Must contain `'nodes'` and
            `'procs_per_node'`.
    """

    # The inference executable sits beside the main LBANN executable
    exe_dir = dirname(abspath(lbann.lbann_exe()))
    lbann_exe = join(exe_dir, 'lbann_inf')

    # LBANN objects
    trainer = lbann.Trainer(
        mini_batch_size=trainer_params['mini_batch_size'],
    )
    model = make_model(**model_params)
    reader = make_data_reader()

    # No optimizer is used here (the training variant of this script
    # used Adam); the learning-rate callbacks are still registered.
    opt = lbann.NoOptimizer()
    lr_schedule = (
        ([1], 2),
        ([2, 4, 8, 12], 0.75),
    )
    for epochs, factor in lr_schedule:
        model.callbacks.append(
            lbann.CallbackDropFixedLearningRate(
                drop_epoch=epochs,
                amt=factor,
            )
        )

    # Note: per-epoch checkpointing and weight dumping are
    # intentionally not registered in this variant.

    # Launch immediately (not as a batch job) with a 30 minute limit
    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        lbann_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
    )
    print(status)
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    *args,
    **kwargs,
):
    """Run LBANN with system-specific optimizations.

    Mirrors the behavior of `lbann.run`, but the batch script is
    created by this module's `make_batch_script`, which receives every
    extra positional and keyword argument. See `lbann.run` for the
    full list of options.

    Returns:
        0 when only the script is written (`setup_only`); otherwise
        whatever `script.submit` (`batch_job`) or `script.run` returns.
    """

    # Batch script generator
    script = make_batch_script(*args, **kwargs)

    # Print start time
    script.add_command('echo "Started at $(date)"')

    # LBANN invocation: executable, user arguments, prototext file
    lbann_command = [lbann_exe]
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=model,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    lbann_command.append('--prototext={}'.format(prototext_file))
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Print finish time and exit with LBANN's status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run the batch script
    if setup_only:
        script.write(overwrite=overwrite_script)
        return 0
    if batch_job:
        return script.submit(overwrite=overwrite_script)
    return script.run(overwrite=overwrite_script)
def setup_embeddings(script, config):
    """Append the embedding-training LBANN invocation to a batch script.

    Reads the training parameters from `config`, builds the LBANN
    trainer, model, optimizer and data reader, exports them to
    `experiment.prototext` in `script.work_dir`, and adds a parallel
    command to `script` that runs LBANN on that file.

    Args:
        script: Batch script object providing `work_dir`,
            `add_body_line` and `add_parallel_command`.
        config: Configuration object queried with
            `get`/`getint`/`getfloat` by section and option name
            (sections `Graph`, `Motifs`, `Walks` and `Embeddings`).

    Raises:
        AssertionError: If the configuration values are inconsistent.
    """

    # Configuration values
    num_vertices = config.getint('Graph', 'num_vertices')
    motif_size = config.getint('Motifs', 'motif_size')
    walk_length = config.getint('Walks', 'walk_length')
    embeddings_dir = config.get('Embeddings', 'embeddings_dir')
    embed_dim = config.getint('Embeddings', 'embed_dim')
    learn_rate = config.getfloat('Embeddings', 'learn_rate')
    mini_batch_size = config.getint('Embeddings', 'mini_batch_size')
    sgd_steps = config.getint('Embeddings', 'sgd_steps')
    sgd_steps_per_epoch = config.getint('Embeddings', 'sgd_steps_per_epoch')
    assert all((
        num_vertices > 0,
        motif_size > 0,
        walk_length >= motif_size,
        embeddings_dir,
        embed_dim > 0,
        mini_batch_size > 0,
        sgd_steps >= 0,
        sgd_steps_per_epoch > 0,
    )), 'invalid configuration for training embeddings'

    # LBANN objects
    # Note: the epoch count is the step count divided by steps per
    # epoch, rounded up.
    num_epochs = (sgd_steps + sgd_steps_per_epoch - 1) // sgd_steps_per_epoch
    trainer = lbann.Trainer(
        mini_batch_size=mini_batch_size,
        num_parallel_readers=0,
    )
    embedding_model = make_model(
        motif_size,
        walk_length,
        num_vertices,
        embed_dim,
        learn_rate,
        num_epochs,
        embeddings_dir,
    )
    optimizer = lbann.SGD(learn_rate=learn_rate)
    data_reader = make_data_reader()

    # Export the experiment and register the LBANN invocation
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=embedding_model,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    script.add_body_line('')
    script.add_body_line('# Train embeddings')
    train_command = [
        lbann.lbann_exe(),
        f'--prototext={prototext_file}',
        '--num_io_threads=1',
    ]
    script.add_parallel_command(train_command)
'ingest_edge_list', ) script.add_parallel_command([ ingest_graph_exe, f'-o {distributed_graph_file}', f'-d {2**30}', '-u 1', graph_file, ]) # LBANN invocation prototext_file = os.path.join(script.work_dir, 'experiment.prototext') lbann.proto.save_prototext( prototext_file, trainer=trainer, model=model, data_reader=reader, optimizer=opt, ) script.add_parallel_command([ lbann.lbann_exe(), f'--prototext={prototext_file}', f'--num_io_threads=1', ]) # Run LBANN if args.batch_job: script.submit(True) else: script.run(True)
from google.protobuf import text_format as txtf import json import numpy as np import models.wae as molwae import lbann import lbann.contrib.launcher import lbann.modules from lbann.util import str_list def list2str(l): return ' '.join(l) lbann_exe = abspath(lbann.lbann_exe()) lbann_exe = join(dirname(lbann_exe), 'lbann_inf') def construct_lc_launcher_args(): # defaults correspond to the settings needed for training on the moses dataset parser = argparse.ArgumentParser(prog="lbann ATOM VAE training") parser.add_argument("--partition", default=None) parser.add_argument("--account", default="hpcdl") parser.add_argument("--scheduler", type=str, default="slurm") parser.add_argument( "--data-module-file", default="dataset.py", help="specifies the module that contains the logic for loading data", )
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=None,
    proto_file_name='experiment.prototext',
    nodes=1,
    procs_per_node=1,
    time_limit=None,
    scheduler=None,
    job_name='lbann',
    partition=None,
    account=None,
    reservation=None,
    launcher_args=[],
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    procs_per_trainer=None,
    environment={},
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    nvprof=False,
    nvprof_output_name=None,
    experiment_dir=None,
):
    """Run LBANN.

    Builds a batch script with `make_batch_script`, exports the
    experiment to a prototext file in the script's working directory,
    and then writes the script (`setup_only`), submits it to the
    scheduler (`batch_job`), or runs it directly (neither flag set).
    Behavior may vary across schedulers.

    Args:
        trainer (lbann.Trainer): LBANN trainer.
        model (lbann.Model): Neural network model.
        data_reader (lbann.reader_pb2.DataReader): Data reader.
        optimizer (lbann.model.Optimizer): Default optimizer for
            model.
        work_dir (str, optional): Working directory. Falls back to
            `experiment_dir` when not given.
        proto_file_name (str, optional): Name of the prototext file
            written inside the script's working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (optional): Command-line arguments to launcher.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (optional): Command-line arguments to LBANN
            executable.
        procs_per_trainer (int, optional): Number of processes per
            LBANN trainer. When omitted, no `--procs_per_trainer`
            flag is passed.
        environment (dict of {str: str}, optional): Environment
            variables.
        overwrite_script (bool, optional): Whether to overwrite the
            script file if it already exists.
        setup_only (bool, optional): If true, the script is written
            but the experiment is not run.
        batch_job (bool, optional): If true, the experiment is
            submitted to the scheduler as a batch job.
        nvprof (bool, optional): If true, an nvprof command is put in
            front of the LBANN executable.
        nvprof_output_name (str, optional): nvprof output filename.
            Filename should be unique to each process by using
            %q{ENV} (see
            https://docs.nvidia.com/cuda/profiler-users-guide/).
        experiment_dir (str, optional, deprecated): See `work_dir`.

    Returns:
        int: Exit status.

    """

    # Batch script generator
    # Note: `experiment_dir` is the deprecated spelling of `work_dir`.
    work_dir = work_dir or experiment_dir
    script = make_batch_script(
        work_dir=work_dir,
        nodes=nodes,
        procs_per_node=procs_per_node,
        time_limit=time_limit,
        scheduler=scheduler,
        job_name=job_name,
        partition=partition,
        account=account,
        reservation=reservation,
        launcher_args=launcher_args,
        environment=environment,
    )

    # Print start time
    script.add_command('echo "Started at $(date)"')

    # LBANN invocation, optionally wrapped by the profiler
    lbann_command = [lbann_exe]
    if nvprof:
        profiler_prefix = nvprof_command(
            work_dir=work_dir,
            output_name=nvprof_output_name,
        )
        lbann_command = profiler_prefix + lbann_command
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, proto_file_name)
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=model,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    lbann_command.append('--prototext={}'.format(prototext_file))
    if procs_per_trainer is not None:
        lbann_command.append(f'--procs_per_trainer={procs_per_trainer}')
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Print finish time and exit with LBANN's status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run the batch script
    if setup_only:
        script.write(overwrite=overwrite_script)
        return 0
    if batch_job:
        return script.submit(overwrite=overwrite_script)
    return script.run(overwrite=overwrite_script)
def run(model, data_reader, optimizer,
        lbann_exe=lbann_exe(),
        lbann_args='',
        experiment_dir=None,
        nodes=1,
        procs_per_node=procs_per_node(),
        time_limit=60,
        scheduler=scheduler(),
        job_name='lbann',
        system=system(),
        partition=partition(),
        account=account(),
        reservation=None,
        launcher_args='',
        environment={},
        setup_only=False):
    """Run LBANN experiment with LC-specific optimizations.

    This is a convenience wrapper around the `lbann.launcher.run`
    function, with defaults and optimizations for LC systems. Launcher
    arguments and environment variables are adjusted for the target
    system and scheduler before everything is forwarded to
    `lbann.launcher.run`.

    The `environment` argument is never modified: the system-specific
    variables are applied to a private copy, so neither the caller's
    dict nor the shared default value accumulates settings between
    calls.

    Args:
        model: Neural network model.
        data_reader: Data reader.
        optimizer: Default optimizer for model.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable.
        experiment_dir (str, optional): Experiment directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        system (str, optional): Target system.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher; system-specific flags are appended to it.
        environment (dict, optional): Environment variables;
            system-specific entries override entries with the same
            name.
        setup_only (bool, optional): Forwarded to
            `lbann.launcher.run`.

    """

    # Copy the environment before adding system-specific variables.
    # Note: Mutating the argument in place would modify the caller's
    # dict and, when the default is used, the single dict object shared
    # by every call of this function.
    environment = dict(environment) if environment is not None else {}

    # Setup GPU bindings
    # Note: Hydrogen processes take ownership of the GPU indices that
    # matches their node communicator ranks. mpibind assigns each rank
    # a unique GPU with index 0, so it should be disabled. Processes
    # may touch the wrong GPUs in the process of figuring out GPU
    # ownership, so an exclusive GPU compute mode causes problems.
    if scheduler == 'slurm' and has_gpu(system):
        launcher_args += ' --mpibind=off --nvidia_compute_mode=default'

    # Deal with Pascal's strange hardware topology
    # Note: Both GPUs on a Pascal node are on the same socket, so we
    # only use cores on that socket.
    if system == 'pascal' and procs_per_node == 2:
        if scheduler == 'slurm':
            launcher_args += ' --cpu_bind=mask_cpu:0x000001ff,0x0003fe00'
        environment['OMP_NUM_THREADS'] = 8
        environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2

    # Hacked bugfix for MPI_Init in MVAPICH2-2.3
    # Note: MPI_Init hangs when started with more than 35
    # processes. This bug is not present in MVAPICH2-2.2 but is
    # present in MVAPICH2-2.3rc2.
    environment['MV2_USE_RDMA_CM'] = 0

    # Hacked bugfix for MPI_Sendrecv in MVAPICH2-2.3
    # Note: MPI_Sendrecv produces incorrect output under certain
    # circumstances. This bug is not present in MVAPICH2-2.2 or
    # MVAPICH2-2.3.1.
    environment['MV2_USE_LAZY_MEM_UNREGISTER'] = 0

    # Magic default arguments to jsrun/etc.
    # Note: Pack processes using ten cores for each, with 40 cores
    # total, and all four GPUs visible to each process.
    if system in ('sierra', 'lassen'):
        if scheduler == 'lsf':
            launcher_args += ' -d packed -b "packed:10" -r 1 -c 40 -g 4'
        environment['OMP_NUM_THREADS'] = 4
        # Deal with topology mis-identification on Sierra/Lassen.
        environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2

    # Run LBANN
    # Note: `reservation` is forwarded so that a reservation requested
    # by the caller actually reaches the scheduler.
    lbann.launcher.run(model, data_reader, optimizer,
                       lbann_exe=lbann_exe,
                       lbann_args=lbann_args,
                       experiment_dir=experiment_dir,
                       nodes=nodes,
                       procs_per_node=procs_per_node,
                       time_limit=time_limit,
                       scheduler=scheduler,
                       job_name=job_name,
                       system=system,
                       partition=partition,
                       account=account,
                       reservation=reservation,
                       launcher_args=launcher_args,
                       environment=environment,
                       setup_only=setup_only)
def run(model, data_reader, optimizer,
        lbann_exe=lbann.lbann_exe(),
        lbann_args='',
        experiment_dir=None,
        nodes=1,
        procs_per_node=1,
        time_limit=60,
        scheduler='slurm',
        job_name='lbann',
        system=None,
        partition=None,
        account=None,
        reservation=None,
        launcher_args='',
        environment={},
        setup_only=False):
    """Run LBANN experiment.

    Writes the experiment prototext file into the experiment directory
    and hands the LBANN command line to the scheduler-specific runner
    (`slurm.run` or `lsf.run`). Behavior may vary across schedulers.

    When no experiment directory is given, a directory named
    `<timestamp>_<job_name>` is created under the directory named by
    the `LBANN_EXPERIMENT_DIR` environment variable, or under the
    current working directory if that variable is not set. A numeric
    suffix is appended if the name is already taken.

    Args:
        model: Neural network model.
        data_reader: Data reader.
        optimizer: Default optimizer for model.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable. The `--prototext` option is appended to it.
        experiment_dir (str, optional): Experiment directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per
            compute node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler (case-insensitive).
            One of 'slurm', 'srun', 'sbatch', 'lsf', 'jsrun', 'bsub'.
        job_name (str, optional): Batch job name.
        system (str, optional): Target system. Accepted but not used
            by this function.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher.
        environment (dict of {str: str}, optional): Environment
            variables.
        setup_only (bool, optional): If true, the experiment is not
            run after the experiment directory is initialized.

    Raises:
        RuntimeError: If the scheduler is not supported.

    """

    # Pick a fresh, timestamped experiment directory if none was given
    if not experiment_dir:
        if 'LBANN_EXPERIMENT_DIR' in os.environ:
            parent_dir = os.environ['LBANN_EXPERIMENT_DIR']
        else:
            parent_dir = os.getcwd()
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        experiment_dir = os.path.join(
            parent_dir, '{}_{}'.format(timestamp, job_name))
        attempt = 1
        while os.path.lexists(experiment_dir):
            # Name is taken: retry with the next numeric suffix
            attempt += 1
            experiment_dir = os.path.join(
                os.path.dirname(experiment_dir),
                '{}_{}_{}'.format(timestamp, job_name, attempt))
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)

    # Export the experiment and point LBANN at it
    prototext_file = os.path.join(experiment_dir, 'experiment.prototext')
    lbann.proto.save_prototext(prototext_file,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_args += ' --prototext=' + prototext_file

    # Dispatch to the scheduler-specific runner
    scheduler_name = scheduler.lower()
    uses_slurm = scheduler_name in ('slurm', 'srun', 'sbatch')
    uses_lsf = scheduler_name in ('lsf', 'jsrun', 'bsub')
    if not (uses_slurm or uses_lsf):
        raise RuntimeError('unsupported job scheduler ({})'
                           .format(scheduler))
    job_options = dict(
        experiment_dir=experiment_dir,
        command='{} {}'.format(lbann_exe, lbann_args),
        nodes=nodes,
        procs_per_node=procs_per_node,
        time_limit=time_limit,
        job_name=job_name,
        partition=partition,
        account=account,
        reservation=reservation,
        environment=environment,
        setup_only=setup_only,
    )
    if uses_slurm:
        slurm.run(srun_args=launcher_args, **job_options)
    else:
        lsf.run(jsrun_args=launcher_args, **job_options)
name='finetune', num_labels=200, mini_batch_size=128, num_epochs=500, learning_rate=0.1, warmup_epochs=50, learning_rate_drop_interval=50, learning_rate_drop_factor=0.25, ) # ============================================== # Construct LBANN invocation # ============================================== # Initialize LBANN executable and command-line arguments lbann_exe = os.path.realpath(lbann.lbann_exe()) lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2') lbann_command = [lbann_exe] # Construct experiment directory experiment_dir = util.make_experiment_dir(args.job_name) # Export model prototext files # Note: lbann2 driver doesn't have a command-line argument to get # trainer. file1 = os.path.join(experiment_dir, 'model1.prototext') file2 = os.path.join(experiment_dir, 'model2.prototext') lbann.proto.save_prototext(file1, model=model1, trainer=lbann.Trainer(mini_batch_size=512)) lbann.proto.save_prototext(file2,