Example #1
def make_batch_script(trainer_params, model_params, script_params):

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(script_params['work_dir'],
                                        'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Dump weights after every epoch
    model.callbacks.append(
        lbann.CallbackDumpWeights(
            basename=os.path.join(script_params['work_dir'], 'weights'),
            epoch_interval=1,
        ))

    # Create Protobuf file
    protobuf_file = os.path.join(script_params['work_dir'],
                                 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Create batch script
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
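
The two learning-rate callbacks above are a coarse, step-wise stand-in for the inverse-square-root (Noam) schedule quoted in the comment. As a point of comparison, here is a minimal sketch of the exact schedule, using the constants from the comment (embed_dim=512, warmup=4000); the noam_lr helper is illustrative and not part of the example:

def noam_lr(step, embed_dim=512, warmup=4000):
    # Exact schedule from the comment:
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    return embed_dim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# Linear warmup up to step 4000, then inverse-square-root decay
for step in (100, 1000, 4000, 16000, 64000):
    print(step, noam_lr(step))

The returned script is not launched here; as in Example #3, the caller can write, run, or submit it with script.write(), script.run(), or script.submit().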
Example #2
def make_batch_script(trainer_params, model_params, script_params):

    # Inference executable (lbann_inf, located alongside the main lbann binary)
    lbann_exe = abspath(lbann.lbann_exe())
    lbann_exe = join(dirname(lbann_exe), 'lbann_inf')

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params['mini_batch_size'])
    model = make_model(**model_params)
    # model.eval()
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    # opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    opt = lbann.NoOptimizer()
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    # trainer.callbacks.append(
    #     lbann.CallbackCheckpoint(
    #         checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'),
    #         checkpoint_epochs=1,
    #     )
    # )

    # Dump weights after every epoch
    # model.callbacks.append(
    #     lbann.CallbackDumpWeights(
    #         basename=os.path.join(script_params['work_dir'], 'weights'),
    #         epoch_interval=1,
    #     )
    # )

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        lbann_exe=lbann_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
    )

    print(status)
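
For reference, a hypothetical set of inputs that would satisfy this function; the names and values below are illustrative and not taken from the original project:

# Hypothetical inputs (illustrative values only)
trainer_params = {'mini_batch_size': 512}
model_params = {}              # forwarded to make_model(**model_params)
script_params = {
    'nodes': 2,
    'procs_per_node': 4,
}

make_batch_script(trainer_params, model_params, script_params)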
Example #3
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    *args,
    **kwargs,
):
    """Run LBANN with system-specific optimizations.

    This is intended to match the behavior of `lbann.run`, with
    defaults and optimizations for the current system. See that
    function for a full list of options.

    """

    # Create batch script generator
    script = make_batch_script(*args, **kwargs)

    # Batch script prints start time
    script.add_command('echo "Started at $(date)"')

    # Batch script invokes LBANN
    lbann_command = [lbann_exe]
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append('--prototext={}'.format(prototext_file))
    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Batch script prints finish time and returns status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, run, or submit batch script
    status = 0
    if setup_only:
        script.write(overwrite=overwrite_script)
    elif batch_job:
        status = script.submit(overwrite=overwrite_script)
    else:
        status = script.run(overwrite=overwrite_script)
    return status
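
make_iterable comes from lbann.util and lets lbann_args be passed either as a single string or as a list of arguments. A minimal sketch of the behavior such a helper typically provides, assuming strings are treated as scalars rather than iterables:

def make_iterable(obj):
    # Wrap scalars (including strings) in a tuple; pass iterables through
    if isinstance(obj, str) or not hasattr(obj, '__iter__'):
        return (obj,)
    return obj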
Example #4
def setup_embeddings(script, config):

    # Get parameters
    num_vertices = config.getint('Graph', 'num_vertices')
    motif_size = config.getint('Motifs', 'motif_size')
    walk_length = config.getint('Walks', 'walk_length')
    embeddings_dir = config.get('Embeddings', 'embeddings_dir')
    embed_dim = config.getint('Embeddings', 'embed_dim')
    learn_rate = config.getfloat('Embeddings', 'learn_rate')
    mini_batch_size = config.getint('Embeddings', 'mini_batch_size')
    sgd_steps = config.getint('Embeddings', 'sgd_steps')
    sgd_steps_per_epoch = config.getint('Embeddings', 'sgd_steps_per_epoch')
    assert (num_vertices>0 and motif_size>0 and walk_length>=motif_size
            and embeddings_dir and embed_dim>0 and mini_batch_size>0
            and sgd_steps>=0 and sgd_steps_per_epoch>0), \
        'invalid configuration for training embeddings'

    # Construct LBANN objects
    num_epochs = (sgd_steps + sgd_steps_per_epoch - 1) // sgd_steps_per_epoch
    trainer = lbann.Trainer(
        mini_batch_size=mini_batch_size,
        num_parallel_readers=0,
    )
    model_ = make_model(
        motif_size,
        walk_length,
        num_vertices,
        embed_dim,
        learn_rate,
        num_epochs,
        embeddings_dir,
    )
    optimizer = lbann.SGD(learn_rate=learn_rate)
    data_reader = make_data_reader()

    # Add LBANN invocation to batch script
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=model_,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    script.add_body_line('')
    script.add_body_line('# Train embeddings')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={prototext_file}',
        '--num_io_threads=1',
    ])
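
The config object is read through getint, getfloat, and get, i.e. the configparser.ConfigParser interface. A hypothetical INI file covering every key this function reads might look like the following (all values are illustrative; they only need to satisfy the assertion above):

import configparser

config = configparser.ConfigParser()
config.read_string("""
[Graph]
num_vertices = 100000

[Motifs]
motif_size = 4

[Walks]
walk_length = 80

[Embeddings]
embeddings_dir = embeddings
embed_dim = 128
learn_rate = 0.025
mini_batch_size = 512
sgd_steps = 10000
sgd_steps_per_epoch = 1000
""")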
Example #5
        'ingest_edge_list',
    )
    script.add_parallel_command([
        ingest_graph_exe,
        f'-o {distributed_graph_file}',
        f'-d {2**30}',
        '-u 1',
        graph_file,
    ])

# LBANN invocation
prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
lbann.proto.save_prototext(
    prototext_file,
    trainer=trainer,
    model=model,
    data_reader=reader,
    optimizer=opt,
)
script.add_parallel_command([
    lbann.lbann_exe(),
    f'--prototext={prototext_file}',
    '--num_io_threads=1',
])

# Run LBANN
if args.batch_job:
    script.submit(overwrite=True)
else:
    script.run(overwrite=True)
Example #6
import argparse
import json
from os.path import abspath, dirname, join

from google.protobuf import text_format as txtf
import numpy as np
import models.wae as molwae

import lbann
import lbann.contrib.launcher
import lbann.modules
from lbann.util import str_list


def list2str(lst):
    return ' '.join(lst)


lbann_exe = abspath(lbann.lbann_exe())
lbann_exe = join(dirname(lbann_exe), 'lbann_inf')


def construct_lc_launcher_args():

    # Defaults correspond to the settings needed for training on the MOSES dataset
    parser = argparse.ArgumentParser(prog="lbann ATOM VAE training")
    parser.add_argument("--partition", default=None)
    parser.add_argument("--account", default="hpcdl")
    parser.add_argument("--scheduler", type=str, default="slurm")
    parser.add_argument(
        "--data-module-file",
        default="dataset.py",
        help="specifies the module that contains the logic for loading data",
    )
Example #7
def run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=None,
    proto_file_name='experiment.prototext',
    nodes=1,
    procs_per_node=1,
    time_limit=None,
    scheduler=None,
    job_name='lbann',
    partition=None,
    account=None,
    reservation=None,
    launcher_args=[],
    lbann_exe=lbann.lbann_exe(),
    lbann_args=[],
    procs_per_trainer=None,
    environment={},
    overwrite_script=False,
    setup_only=False,
    batch_job=False,
    nvprof=False,
    nvprof_output_name=None,
    experiment_dir=None,
):
    """Run LBANN.

    This is intended to interface with job schedulers on HPC
    clusters. It will either submit a batch job (if on a login node)
    or run with an existing node allocation (if on a compute
    node). Behavior may vary across schedulers.

    If an experiment directory is not provided, a timestamped
    directory is created (by default in the current working
    directory). The location of autogenerated experiment directories
    can be set with the environment variable `LBANN_EXPERIMENT_DIR`.

    Args:
        trainer (lbann.Trainer): LBANN trainer.
        model (lbann.Model): Neural network model.
        data_reader (lbann.reader_pb2.DataReader): Data reader.
        optimizer (lbann.model.Optimizer): Default optimizer for
            model.
        work_dir (str, optional): Working directory.
        proto_file_name (str, optional): Name of the experiment
            prototext file written to the working directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per compute
            node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable.
        procs_per_trainer (int, optional): Number of processes per
            LBANN trainer. Default is all processes in one trainer.
        environment (dict of {str: str}, optional): Environment
            variables.
        overwrite_script (bool, optional): Whether to overwrite script
            file if it already exists.
        setup_only (bool, optional): If true, the experiment is not
            run after the experiment directory is initialized.
        batch_job (bool, optional): If true, the experiment is
            submitted to the scheduler as a batch job.
        nvprof (bool, optional): If true, an nvprof command is added
            to the beginning of LBANN executable.
        nvprof_output_name (str, optional): nvprof output filename.
            Filename should be unique to each process by using %q{ENV}
            (see https://docs.nvidia.com/cuda/profiler-users-guide/).
        experiment_dir (str, optional, deprecated): See `work_dir`.

    Returns:
        int: Exit status.

    """

    # Create batch script generator
    if not work_dir:
        work_dir = experiment_dir
    script = make_batch_script(work_dir=work_dir,
                               nodes=nodes,
                               procs_per_node=procs_per_node,
                               time_limit=time_limit,
                               scheduler=scheduler,
                               job_name=job_name,
                               partition=partition,
                               account=account,
                               reservation=reservation,
                               launcher_args=launcher_args,
                               environment=environment)

    # Batch script prints start time
    script.add_command('echo "Started at $(date)"')

    # Batch script invokes LBANN
    lbann_command = [lbann_exe]
    if nvprof:
        lbann_command = nvprof_command(
            work_dir=work_dir, output_name=nvprof_output_name) + lbann_command
    lbann_command.extend(make_iterable(lbann_args))
    prototext_file = os.path.join(script.work_dir, proto_file_name)
    lbann.proto.save_prototext(prototext_file,
                               trainer=trainer,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_command.append('--prototext={}'.format(prototext_file))
    if procs_per_trainer is not None:
        lbann_command.append(f'--procs_per_trainer={procs_per_trainer}')

    script.add_parallel_command(lbann_command)
    script.add_command('status=$?')

    # Batch script prints finish time and returns status
    script.add_command('echo "Finished at $(date)"')
    script.add_command('exit ${status}')

    # Write, submit, or run batch script
    status = 0
    if setup_only:
        script.write(overwrite=overwrite_script)
    elif batch_job:
        status = script.submit(overwrite=overwrite_script)
    else:
        status = script.run(overwrite=overwrite_script)
    return status
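
A hypothetical invocation of this function, assuming the trainer, model, reader, and optimizer objects have already been built as in Examples #1 and #2:

# Hypothetical call; the four LBANN objects are built elsewhere
status = run(
    trainer,
    model,
    reader,
    opt,
    nodes=2,
    procs_per_node=4,
    time_limit=60,
    batch_job=True,   # submit to the scheduler instead of running now
)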
Example #8
def run(model, data_reader, optimizer,
        lbann_exe=lbann_exe(),
        lbann_args='',
        experiment_dir=None,
        nodes=1,
        procs_per_node=procs_per_node(),
        time_limit=60,
        scheduler=scheduler(),
        job_name='lbann',
        system=system(),
        partition=partition(),
        account=account(),
        reservation=None,
        launcher_args='',
        environment={},
        setup_only=False):
    """Run LBANN experiment with LC-specific optimizations.

    This is a convenience wrapper around the `lbann.launcher.run`
    function, with defaults and optimizations for LC systems.

    """

    # Set up GPU bindings
    # Note: Hydrogen processes take ownership of the GPU indices that
    # match their node communicator ranks. mpibind assigns each rank
    # a unique GPU with index 0, so it should be disabled. Processes
    # may touch the wrong GPUs in the process of figuring out GPU
    # ownership, so an exclusive GPU compute mode causes problems.
    if scheduler == 'slurm' and has_gpu(system):
        launcher_args += ' --mpibind=off --nvidia_compute_mode=default'

    # Deal with Pascal's strange hardware topology
    # Note: Both GPUs on a Pascal node are on the same socket, so we
    # only use cores on that socket.
    if system == 'pascal' and procs_per_node == 2:
        if scheduler == 'slurm':
            launcher_args += ' --cpu_bind=mask_cpu:0x000001ff,0x0003fe00'
        environment['OMP_NUM_THREADS'] = 8
        environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2

    # Hacked bugfix for MPI_Init in MVAPICH2-2.3
    # Note: MPI_Init hangs when started with more than 35
    # processes. This bug is not present in MVAPICH2-2.2 but is
    # present in MVAPICH2-2.3rc2.
    environment['MV2_USE_RDMA_CM'] = 0

    # Hacked bugfix for MPI_Sendrecv in MVAPICH2-2.3
    # Note: MPI_Sendrecv produces incorrect output under certain
    # circumstances. This bug is not present in MVAPICH2-2.2 or
    # MVAPICH2-2.3.1.
    environment['MV2_USE_LAZY_MEM_UNREGISTER'] = 0

    # Magic default arguments to jsrun/etc.
    # Note: Pack processes with ten cores each (40 cores total) and make
    # all four GPUs visible to each process.
    if system in ('sierra', 'lassen'):
        if scheduler == 'lsf':
            launcher_args += ' -d packed -b "packed:10" -r 1 -c 40 -g 4'
        environment['OMP_NUM_THREADS'] = 4
        # Deal with topology mis-identification on Sierra/Lassen.
        environment['AL_PROGRESS_RANKS_PER_NUMA_NODE'] = 2

    # Run LBANN
    lbann.launcher.run(model, data_reader, optimizer,
                       lbann_exe=lbann_exe,
                       lbann_args=lbann_args,
                       experiment_dir=experiment_dir,
                       nodes=nodes,
                       procs_per_node=procs_per_node,
                       time_limit=time_limit,
                       scheduler=scheduler,
                       job_name=job_name,
                       system=system,
                       partition=partition,
                       account=account,
                       reservation=reservation,
                       launcher_args=launcher_args,
                       environment=environment,
                       setup_only=setup_only)
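
The two cpu_bind masks for Pascal are per-rank core bitmasks. Decoding them shows each rank bound to one nine-core block, consistent with the note that only the GPU socket's cores are used:

# Decode the Pascal cpu_bind masks (each hex value is a core bitmask)
for mask in (0x000001ff, 0x0003fe00):
    cores = [i for i in range(36) if mask >> i & 1]
    print(hex(mask), '-> cores', cores[0], 'through', cores[-1])
# 0x1ff   -> cores 0 through 8   (rank 0)
# 0x3fe00 -> cores 9 through 17  (rank 1)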
Example #9
def run(model, data_reader, optimizer,
        lbann_exe=lbann.lbann_exe(),
        lbann_args='',
        experiment_dir=None,
        nodes=1,
        procs_per_node=1,
        time_limit=60,
        scheduler='slurm',
        job_name='lbann',
        system=None,
        partition=None,
        account=None,
        reservation=None,
        launcher_args='',
        environment={},
        setup_only=False):
    """Run LBANN experiment.

    This is intended to interface with job schedulers on HPC
    clusters. It will either submit a batch job (if on a login node)
    or run with an existing node allocation (if on a compute
    node). Behavior may vary across schedulers.

    If an experiment directory is not provided, a timestamped
    directory is created (by default in the current working
    directory). The location of autogenerated experiment directories
    can be set with the environment variable `LBANN_EXPERIMENT_DIR`.

    Args:
        model (lbann.model.Model or lbann_pb2.Model): Neural network
            model.
        data_reader (lbann_pb2.DataReader): Data reader.
        optimizer (lbann.model.Optimizer or lbann_pb2.Optimizer): Default
            optimizer for model.
        lbann_exe (str, optional): LBANN executable.
        lbann_args (str, optional): Command-line arguments to LBANN
            executable.
        experiment_dir (str, optional): Experiment directory.
        nodes (int, optional): Number of compute nodes.
        procs_per_node (int, optional): Number of processes per compute
            node.
        time_limit (int, optional): Job time limit, in minutes.
        scheduler (str, optional): Job scheduler.
        job_name (str, optional): Batch job name.
        system (str, optional): Target system.
        partition (str, optional): Scheduler partition.
        account (str, optional): Scheduler account.
        reservation (str, optional): Scheduler reservation name.
        launcher_args (str, optional): Command-line arguments to
            launcher.
        environment (dict of {str: str}, optional): Environment
            variables.
        setup_only (bool, optional): If true, the experiment is not
            run after the experiment directory is initialized.

    """

    # Construct experiment directory if needed
    if not experiment_dir:
        if 'LBANN_EXPERIMENT_DIR' in os.environ:
            experiment_dir = os.environ['LBANN_EXPERIMENT_DIR']
        else:
            experiment_dir = os.getcwd()
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        experiment_dir = os.path.join(experiment_dir,
                                      '{}_{}'.format(timestamp, job_name))
        i = 1
        while os.path.lexists(experiment_dir):
            i += 1
            experiment_dir = os.path.join(
                os.path.dirname(experiment_dir),
                '{}_{}_{}'.format(timestamp, job_name, i))
    experiment_dir = os.path.abspath(experiment_dir)
    os.makedirs(experiment_dir, exist_ok=True)

    # Create experiment prototext file
    prototext_file = os.path.join(experiment_dir, 'experiment.prototext')
    lbann.proto.save_prototext(prototext_file,
                               model=model,
                               data_reader=data_reader,
                               optimizer=optimizer)
    lbann_args += ' --prototext=' + prototext_file

    # Run experiment
    if scheduler.lower() in ('slurm', 'srun', 'sbatch'):
        slurm.run(experiment_dir=experiment_dir,
                  command='{} {}'.format(lbann_exe, lbann_args),
                  nodes=nodes,
                  procs_per_node=procs_per_node,
                  time_limit=time_limit,
                  job_name=job_name,
                  partition=partition,
                  account=account,
                  reservation=reservation,
                  srun_args=launcher_args,
                  environment=environment,
                  setup_only=setup_only)
    elif scheduler.lower() in ('lsf', 'jsrun', 'bsub'):
        lsf.run(experiment_dir=experiment_dir,
                command='{} {}'.format(lbann_exe, lbann_args),
                nodes=nodes,
                procs_per_node=procs_per_node,
                time_limit=time_limit,
                job_name=job_name,
                partition=partition,
                account=account,
                reservation=reservation,
                jsrun_args=launcher_args,
                environment=environment,
                setup_only=setup_only)
    else:
        raise RuntimeError('unsupported job scheduler ({})'
                           .format(scheduler))
Example #10
File: main.py Project: oyamay/lbann
    name='finetune',
    num_labels=200,
    mini_batch_size=128,
    num_epochs=500,
    learning_rate=0.1,
    warmup_epochs=50,
    learning_rate_drop_interval=50,
    learning_rate_drop_factor=0.25,
)

# ==============================================
# Construct LBANN invocation
# ==============================================

# Initialize LBANN executable and command-line arguments
lbann_exe = os.path.realpath(lbann.lbann_exe())
lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2')
lbann_command = [lbann_exe]

# Construct experiment directory
experiment_dir = util.make_experiment_dir(args.job_name)

# Export model prototext files
# Note: The lbann2 driver doesn't have a command-line argument for the
# trainer, so the trainer is saved into the model prototext files.
file1 = os.path.join(experiment_dir, 'model1.prototext')
file2 = os.path.join(experiment_dir, 'model2.prototext')
lbann.proto.save_prototext(file1,
                           model=model1,
                           trainer=lbann.Trainer(mini_batch_size=512))
lbann.proto.save_prototext(file2,