Example 1
def make_batch_script(trainer_params, model_params, script_params):

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(script_params['work_dir'],
                                        'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Dump weights after every epoch
    model.callbacks.append(
        lbann.CallbackDumpWeights(
            basename=os.path.join(script_params['work_dir'], 'weights'),
            epoch_interval=1,
        ))

    # Create Protobuf file
    protobuf_file = os.path.join(script_params['work_dir'],
                                 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Create batch script
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
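The learning-rate comment above refers to the Transformer warmup schedule from "Attention Is All You Need". As a minimal sketch of the exact formula the two drop-LR callbacks approximate (plain Python, not an LBANN API):

def transformer_lr(step, embed_dim=512, warmup=4000):
    # lr = embed_dim^-0.5 * min(step^-0.5, step * warmup^-1.5)
    return embed_dim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# At the end of warmup (step 4000): 512**-0.5 * 4000**-0.5 ~= 7.0e-4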
Example 2
def make_batch_script(trainer_params, model_params, script_params):

    # Inference executable
    lbann_exe = abspath(lbann.lbann_exe())
    lbann_exe = join(dirname(lbann_exe), 'lbann_inf')

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params['mini_batch_size'])
    model = make_model(**model_params)
    # model.eval()
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    # opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    opt = lbann.NoOptimizer()
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    # trainer.callbacks.append(
    #     lbann.CallbackCheckpoint(
    #         checkpoint_dir=os.path.join(script_params['work_dir'], 'checkpoint'),
    #         checkpoint_epochs=1,
    #     )
    # )

    # Dump weights after every epoch
    # model.callbacks.append(
    #     lbann.CallbackDumpWeights(
    #         basename=os.path.join(script_params['work_dir'], 'weights'),
    #         epoch_interval=1,
    #     )
    # )

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        reader,
        opt,
        lbann_exe,
        nodes=script_params['nodes'],
        procs_per_node=script_params['procs_per_node'],
        time_limit=30,
        setup_only=False,
        batch_job=False,
    )
    # **kwargs)

    print(status)
Example 3
def setup_experiment(lbann):
    """Construct LBANN experiment. 

    args: 
        lbann (module): Module for LBANN Python frontend
        
    """

    
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)
    
    model = make_model(NUM_NODES,
                       NUM_EDGES,
                       NUM_NODES_FEATURES,
                       NUM_EDGE_FEATURES,
                       EMBEDDING_DIM,
                       EDGE_EMBEDDING_DIM,
                       NUM_OUT_FEATURES,
                       num_epochs)
    reader = LSC_PPQM4M.make_data_reader("LSC_100K",
                                         validation_percent=0)
    

    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
Example 4
def setup_experiment(lbann):
    """Construct LBANN experiment. 

    args: 
        lbann (module): Module for LBANN Python frontend
        
    """

    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)

    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackGPUMemoryUsage()
    ]

    model = Sparse_Graph_Trainer.make_model(kernel_type='GatedGraph',
                                            num_epochs=num_epochs,
                                            callbacks=callbacks)
    reader = data.PROTEINS.make_data_reader()

    # No validation set

    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
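Both setup_experiment variants return the same (trainer, model, reader, optimizer) tuple. A hedged sketch of how a driver script might consume it, mirroring the launcher calls in the later examples (the job name here is illustrative):

import lbann
import lbann.contrib.launcher

trainer, model, reader, optimizer = setup_experiment(lbann)
lbann.contrib.launcher.run(trainer, model, reader, optimizer,
                           job_name='gnn_experiment')  # hypothetical job name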
Example 5
def setup_embeddings(script, config):

    # Get parameters
    num_vertices = config.getint('Graph', 'num_vertices')
    motif_size = config.getint('Motifs', 'motif_size')
    walk_length = config.getint('Walks', 'walk_length')
    embeddings_dir = config.get('Embeddings', 'embeddings_dir')
    embed_dim = config.getint('Embeddings', 'embed_dim')
    learn_rate = config.getfloat('Embeddings', 'learn_rate')
    mini_batch_size = config.getint('Embeddings', 'mini_batch_size')
    sgd_steps = config.getint('Embeddings', 'sgd_steps')
    sgd_steps_per_epoch = config.getint('Embeddings', 'sgd_steps_per_epoch')
    assert (num_vertices>0 and motif_size>0 and walk_length>=motif_size
            and embeddings_dir and embed_dim>0 and mini_batch_size>0
            and sgd_steps>=0 and sgd_steps_per_epoch>0), \
        'invalid configuration for training embeddings'

    # Construct LBANN objects
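    # Ceiling division: e.g., sgd_steps=1000 with sgd_steps_per_epoch=300
    # yields num_epochs = ceil(1000 / 300) = 4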
    num_epochs = (sgd_steps + sgd_steps_per_epoch - 1) // sgd_steps_per_epoch
    trainer = lbann.Trainer(
        mini_batch_size=mini_batch_size,
        num_parallel_readers=0,
    )
    model_ = make_model(
        motif_size,
        walk_length,
        num_vertices,
        embed_dim,
        learn_rate,
        num_epochs,
        embeddings_dir,
    )
    optimizer = lbann.SGD(learn_rate=learn_rate)
    data_reader = make_data_reader()

    # Add LBANN invocation to batch script
    prototext_file = os.path.join(script.work_dir, 'experiment.prototext')
    lbann.proto.save_prototext(
        prototext_file,
        trainer=trainer,
        model=model_,
        data_reader=data_reader,
        optimizer=optimizer,
    )
    script.add_body_line('')
    script.add_body_line('# Train embeddings')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={prototext_file}',
        '--num_io_threads=1',
    ])
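The config.getint/getfloat calls above assume an INI-style file parsed with Python's configparser. A minimal sketch of the expected sections (all values are illustrative, not taken from the original project):

import configparser

config = configparser.ConfigParser()
config.read_string("""
[Graph]
num_vertices = 1000

[Motifs]
motif_size = 4

[Walks]
walk_length = 80

[Embeddings]
embeddings_dir = embeddings
embed_dim = 128
learn_rate = 0.025
mini_batch_size = 512
sgd_steps = 1000
sgd_steps_per_epoch = 300
""")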
Example 6
def set_up_experiment(args, input_, probs, labels):
    # Set up objective function
    cross_entropy = lbann.CrossEntropy([probs, labels])
    layers = list(lbann.traverse_layer_graph(input_))
    l2_reg_weights = set()
    for l in layers:
        if type(l) in (lbann.Convolution, lbann.FullyConnected):
            l2_reg_weights.update(l.weights)
    # scale = weight decay
    l2_reg = lbann.L2WeightRegularization(weights=l2_reg_weights, scale=1e-4)
    objective_function = lbann.ObjectiveFunction([cross_entropy, l2_reg])

    # Set up model
    top1 = lbann.CategoricalAccuracy([probs, labels])
    top5 = lbann.TopKCategoricalAccuracy([probs, labels], k=5)
    metrics = [
        lbann.Metric(top1, name='top-1 accuracy', unit='%'),
        lbann.Metric(top5, name='top-5 accuracy', unit='%')
    ]
    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackDropFixedLearningRate(drop_epoch=[30, 60], amt=0.1)
    ]
    model = lbann.Model(args.num_epochs,
                        layers=layers,
                        objective_function=objective_function,
                        metrics=metrics,
                        callbacks=callbacks)

    # Set up data reader
    data_reader = data.imagenet.make_data_reader(num_classes=args.num_classes)

    # Set up optimizer
    if args.optimizer == 'sgd':
        print('Creating sgd optimizer')
        optimizer = lbann.optimizer.SGD(
            learn_rate=args.optimizer_learning_rate,
            momentum=0.9,
            nesterov=True)
    else:
        optimizer = lbann.contrib.args.create_optimizer(args)

    # Setup trainer
    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

    return trainer, model, data_reader, optimizer
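Example 6 reads several attributes off args. A hypothetical argparse wiring consistent with those accesses (flag names and defaults are illustrative; the real model zoo scripts build some of these flags via lbann.contrib.args helpers):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--num-epochs', type=int, default=90)
parser.add_argument('--num-classes', type=int, default=1000)
parser.add_argument('--optimizer', default='sgd')
parser.add_argument('--optimizer-learning-rate', type=float, default=0.1)
parser.add_argument('--mini-batch-size', type=int, default=256)
args = parser.parse_args()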
Example 7
                        metrics=metrics,
                        callbacks=callbacks)

    # Setup optimizer
    optimizer = lbann.contrib.args.create_optimizer(args)

    # Setup data reader
    data_reader = create_cosmoflow_data_reader(args.train_dir,
                                               args.val_dir,
                                               args.test_dir,
                                               num_responses=args.num_secrets)

    # Setup trainer
    random_seed_arg = {'random_seed': args.random_seed} \
        if args.random_seed is not None else {}
    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size,
                            **random_seed_arg)

    # Runtime parameters/arguments
    environment = lbann.contrib.args.get_distconv_environment(
        num_io_partitions=args.depth_groups)
    if args.dynamically_reclaim_error_signals:
        environment['LBANN_KEEP_ERROR_SIGNALS'] = 0
    else:
        environment['LBANN_KEEP_ERROR_SIGNALS'] = 1
    lbann_args = ['--use_data_store']

    # Run experiment
    kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
    lbann.contrib.launcher.run(trainer,
                               model,
                               data_reader,
Example 8
def main():
    run_args = construct_lc_launcher_args()

    # Merge in settings from the data config file, but do not overwrite
    # args when data_reader_prototext is enabled
    if os.path.isfile(
            run_args.data_config) and not run_args.data_reader_prototext:
        with open(run_args.data_config, "r") as f:
            config = json.load(f)
        for k, v in config.items():
            setattr(run_args, k, v)

    trainer = lbann.Trainer(run_args.batch_size,
                            #name=None,
                            )

    # define data_reader
    if run_args.data_reader_prototext:
        print("Using data_reader_prototext")
        assert run_args.sequence_length is not None
        assert run_args.vocab is not None

        data_reader_proto = lbann.lbann_pb2.LbannPB()
        with open(run_args.data_reader_prototext, "r") as f:
            txtf.Merge(f.read(), data_reader_proto)
        data_reader = data_reader_proto.data_reader
    else:
        data_reader = construct_data_reader(run_args)

    if "LBANN_EXPERIMENT_DIR" in os.environ:
        work_dir = os.environ["LBANN_EXPERIMENT_DIR"]
    else:
        work_dir = os.getcwd()
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(work_dir,
                                  "{}_{}".format(timestamp, run_args.job_name))
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    # model and optimizer
    model = construct_model(run_args)
    opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8)

    ppn = 4 if run_args.scheduler == "lsf" else 2
    print("args:\n" + str(run_args))

    # Dump the config to experiment_dir so it can be used to load the
    # model in PyTorch (moses codebase)
    if run_args.scheduler == 'slurm':
        import torch
        torch.save(run_args, "{}/{}_config.pt".format(experiment_dir,
                                                      run_args.job_name))

    m_lbann_args = f"--load_model_weights_dir_is_complete --load_model_weights_dir={run_args.dump_model_dir} --vocab={run_args.vocab} --num_samples={run_args.num_samples} --sequence_length={run_args.sequence_length}  --num_io_threads={run_args.num_io_threads} --no_header={run_args.no_header} --delimiter={run_args.delimiter}"
    if run_args.data_reader_prototext:
        m_lbann_args = " ".join(
            (m_lbann_args, "--use_data_store --preload_data_store"))
    if run_args.procs_per_trainer:
        m_lbann_args = " ".join(
            (m_lbann_args,
             f"--procs_per_trainer={run_args.procs_per_trainer}"))

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader,
        opt,
        lbann_exe,
        partition=run_args.partition,
        scheduler=run_args.scheduler,
        account=run_args.account,
        time_limit=run_args.time_limit,
        nodes=run_args.nodes,
        procs_per_node=ppn,
        batch_job=True,
        #setup_only = True,
        job_name=run_args.job_name,
        experiment_dir=experiment_dir,
        lbann_args=m_lbann_args,
        # Turn on for tensor cores
        environment={
            'LBANN_USE_CUBLAS_TENSOR_OPS': 1,
            'LBANN_USE_CUDNN_TENSOR_OPS': 1,
        },
    )

    print("LBANN launcher status:\n" + str(status))
Example 9
num_epochs = 20
model = lbann.Model(num_epochs,
                    layers=lbann.traverse_layer_graph(input_),
                    objective_function=loss,
                    metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
                    callbacks=[
                        lbann.CallbackPrintModelDescription(),
                        lbann.CallbackPrint(),
                        lbann.CallbackTimer()
                    ])

# Setup optimizer
opt = lbann.SGD(learn_rate=0.01, momentum=0.9)

# Setup data reader
data_reader = data.mnist.make_data_reader()

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=mini_batch_size)

# ----------------------------------
# Run experiment
# ----------------------------------
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer,
                           model,
                           data_reader,
                           opt,
                           job_name=args.job_name,
                           **kwargs)
Example 10
                    objective_function=obj,
                    metrics=metrics,
                    callbacks=callbacks,
                    summary_dir=".")

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)

# Setup data reader
num_classes = min(args.num_classes, num_labels)

if dataset == "cifar10":
    data_reader = cifar10.make_data_reader(num_classes=num_classes)
else:
    data_reader = imagenet.make_data_reader(num_classes=num_classes)

# Setup trainer
trainer = lbann.Trainer(random_seed=args.random_seed,
                        mini_batch_size=args.mini_batch_size)

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
kwargs['lbann_args'] = '--data_reader_percent=' + str(args.data_reader_percent)

lbann.contrib.launcher.run(trainer,
                           model,
                           data_reader,
                           opt,
                           job_name=args.job_name,
                           **kwargs)
Example 11
    print("Random seed",random_seed)
    if mcr: print("Using Multi-channel rescaling")
    ### Create prefix for foldername
    now=datetime.now()
    fldr_name=now.strftime('%Y%m%d_%H%M%S') ## time format
    
    data_pct,val_ratio=1.0,0.1 # Percentage of data to use, % of data for validation
    batchsize=args.batchsize
    step_interval=args.step_interval
    
    print('Step interval',step_interval)
    work_dir="/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/512square/{0}_bsize{1}_{2}".format(fldr_name,batchsize,args.suffix) 
    
    #####################
    ### Run lbann
    trainer = lbann.Trainer(mini_batch_size=batchsize,
                            random_seed=random_seed,
                            callbacks=lbann.CallbackCheckpoint(
                                checkpoint_dir='chkpt',
                                checkpoint_epochs=10))
                                # checkpoint_steps=step_interval))

    spectral_loss = args.spec_loss
    print("Spectral loss: ", spectral_loss)
    # 'step_interval*val_ratio' is the step interval for the validation set.
    model = construct_model(num_epochs, mcr, spectral_loss=spectral_loss,
                            save_batch_interval=int(step_interval))
    # Setup optimizer
    opt = lbann.Adam(learn_rate=args.learn_rate, beta1=0.5, beta2=0.99, eps=1e-8)
    # Load data reader from prototext
    data_reader = construct_data_reader(data_pct,val_ratio)
    
    status = lbann.run(trainer, model, data_reader, opt,
                       nodes=num_nodes, procs_per_node=num_procs,
                       work_dir=work_dir,
                       scheduler='slurm', time_limit=1440, setup_only=False)
    
Example 12
# ==============================================
# Setup and launch experiment
# ==============================================

# Default data reader
model_zoo_dir = dirname(dirname(abspath(__file__)))
data_reader_prototext = join(model_zoo_dir, 'data', 'jag_100Kdata.prototext')

if __name__ == '__main__':
    import lbann
    import google.protobuf.text_format as txtf  # needed for txtf.Merge below

    y_dim = 16399  # image+scalar shape
    z_dim = 20  # latent space dim
    num_epochs = 100
    mini_batch_size = 128
    trainer = lbann.Trainer(mini_batch_size=mini_batch_size, serialize_io=True)
    model = jag_models.construct_jag_wae_model(y_dim=y_dim,
                                               z_dim=z_dim,
                                               num_epochs=num_epochs)
    # Setup optimizer
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.99, eps=1e-8)
    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    with open(data_reader_prototext, 'r') as f:
        txtf.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader

    status = lbann.run(trainer,
                       model,
                       data_reader_proto,
                       opt,
Example 13
metrics = [lbann.Metric(obj, name="loss")]

model = lbann.Model(
    lbann_params.epochs,
    layers=lbann.traverse_layer_graph(input_),
    objective_function=obj,
    metrics=metrics,
    callbacks=[
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
    ],
)

# Setup trainer, optimizer, data_reader
trainer = lbann.Trainer(
    mini_batch_size=lbann_params.mini_batch_size,
    num_parallel_readers=1,
)
optimizer = lbann.Adam(
    learn_rate=0.01,
    beta1=0.9,
    beta2=0.99,
    eps=1e-8,
)
data_reader = make_data_reader()

# Launch LBANN
kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params)
kwargs["environment"] = {}
lbann.contrib.launcher.run(
    trainer,
    model,
Example 14
    fldr_name = now.strftime('%Y%m%d_%H%M%S')  ## time format

    data_pct, val_ratio = 1.0, 0.1  # Fraction of data to use, fraction for validation
    batchsize = gdict['batchsize']

    work_dir = gdict['op_loc'] + "{0}_bsize{1}_{2}".format(
        fldr_name, batchsize, gdict['run_suffix'])
    print(work_dir)
    print(gdict)

    #####################
    ### Run lbann
    trainer = lbann.Trainer(
        mini_batch_size=batchsize,
        random_seed=random_seed,
        callbacks=lbann.CallbackCheckpoint(
            checkpoint_dir='chkpt',
            # checkpoint_epochs=10,
            checkpoint_steps=gdict['step_interval']))

    spectral_loss = gdict['lambda_spec']
    if spectral_loss: print("Using Spectral loss with coupling", spectral_loss)

    # 'step_interval*val_ratio' is the step interval for the validation set.
    model = construct_model(
        num_epochs,
        gdict['mcr'],
        spectral_loss=spectral_loss,
        save_batch_interval=int(gdict['step_interval']))
    # Setup optimizer
    opt = lbann.Adam(learn_rate=gdict['learn_rate'],
Example 15
    print('Save interval', save_interval)

    ### Create foldername inside initial folder
    now = datetime.now()
    d1 = now.strftime('%Y%m%d_%H%M%S')  ## time format
    strg = args.pretrained_dir
    top_dir = strg.split('chkpt')[0]  # parent directory
    # Adding epoch-step info as folder suffix
    suffix = strg.split('chkpt')[-1].split('/')[2].split('training')[-1][1:]
    work_dir = top_dir + 'gen_imgs_chkpt/{0}_{1}'.format(suffix, d1)
    print("Generating images at ", work_dir)
    #     os.makedirs(work_dir)

    #####################
    ### Run lbann
    trainer = lbann.Trainer(mini_batch_size=batchsize, random_seed=random_seed)
    model = construct_model(num_epochs, mcr, save_batch_interval=save_interval)
    # Setup optimizer
    opt = lbann.Adam(learn_rate=0.0002, beta1=0.5, beta2=0.99, eps=1e-8)
    # Load data reader from prototext
    data_reader = construct_data_reader(data_pct, val_ratio)

    ### Initialize LBANN inf executable
    lbann_exe = abspath(lbann.lbann_exe())
    lbann_exe = join(dirname(lbann_exe), 'lbann_inf')

    print('Loading model from :', args.pretrained_dir)
    status = lbann.run(
        trainer,
        model,
        data_reader,
Example 16
File: main.py, project: oyamay/lbann
# Initialize LBANN executable and command-line arguments
lbann_exe = os.path.realpath(lbann.lbann_exe())
lbann_exe = os.path.join(os.path.dirname(lbann_exe), 'lbann2')
lbann_command = [lbann_exe]

# Construct experiment directory
experiment_dir = util.make_experiment_dir(args.job_name)

# Export model prototext files
# Note: The lbann2 driver doesn't have a command-line argument for the
# trainer, so a trainer is bundled into each model prototext.
file1 = os.path.join(experiment_dir, 'model1.prototext')
file2 = os.path.join(experiment_dir, 'model2.prototext')
lbann.proto.save_prototext(file1,
                           model=model1,
                           trainer=lbann.Trainer(mini_batch_size=512))
lbann.proto.save_prototext(file2,
                           model=model2,
                           trainer=lbann.Trainer(mini_batch_size=512))
lbann_command.append(f'--model={{{file1},{file2}}}')

# Export data reader prototext files
file1 = os.path.join(experiment_dir, 'reader1.prototext')
file2 = os.path.join(experiment_dir, 'reader2.prototext')
lbann.proto.save_prototext(file1, data_reader=reader1)
lbann.proto.save_prototext(file2, data_reader=reader2)
lbann_command.append(f'--reader={{{file1},{file2}}}')

# Export optimizer prototext files
file1 = os.path.join(experiment_dir, 'opt1.prototext')
file2 = os.path.join(experiment_dir, 'opt2.prototext')
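The excerpt ends before the optimizer files are written. By analogy with the model and reader exports above, the remaining lines would plausibly look like this (a sketch; opt1 and opt2 are assumed optimizer objects defined elsewhere in the file):

lbann.proto.save_prototext(file1, optimizer=opt1)
lbann.proto.save_prototext(file2, optimizer=opt2)
lbann_command.append(f'--optimizer={{{file1},{file2}}}')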
Example 17
                               low_score_wins=True,
                               exchange_hyperparameters=True))
    # Construct model
    return lbann.Model(args.num_epochs,
                       weights=weights,
                       serialize_io=True,
                       layers=layers,
                       metrics=metrics,
                       objective_function=obj,
                       callbacks=callbacks)


if __name__ == '__main__':
    import lbann
    import google.protobuf.text_format as txtf  # needed for txtf.Merge below

    trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size,
                            procs_per_trainer=args.procs_per_trainer)
    model = construct_model()
    # Setup optimizer
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.99, eps=1e-8)
    # Load data reader from prototext
    data_reader_proto = lbann.lbann_pb2.LbannPB()
    with open(data_reader_prototext, 'r') as f:
        txtf.Merge(f.read(), data_reader_proto)
    data_reader_proto = data_reader_proto.data_reader

    kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader_proto,
        opt,
Example 18
callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]

layers = list(lbann.traverse_layer_graph([images, responses]))
model = lbann.Model(args.num_epochs,
                    layers=layers,
                    metrics=metrics,
                    objective_function=mse,
                    callbacks=callbacks)

# Load data reader from prototext
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(data_reader_prototext, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

# Setup optimizer
opt = lbann.Adam(learn_rate=0.0002, beta1=0.9, beta2=0.99, eps=1e-8)

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer,
                           model,
                           data_reader_proto,
                           opt,
                           lbann_args=" --use_data_store --preload_data_store",
                           job_name=args.job_name,
                           **kwargs)
Example 19
# ----------------------------------
# Set up Experiment
# ----------------------------------

# Generate model
model = lbann.Model(
    num_epochs,
    layers=layers,
    objective_function=img_loss,
    metrics=metrics,
    callbacks=[print_model, training_output, gpu_usage, encoded_output])

# Optimizer

opt = lbann.Adam(learn_rate=1e-2, beta1=0.9, beta2=0.99, eps=1e-8)

data_reader = MOFae.make_data_reader()

# Trainer

trainer = lbann.Trainer(mini_batch_size=mini_batch_size, name="MOF_AE_1")

# ----------------------------------
# Run Experiment
# ----------------------------------

kwargs = lbann.contrib.args.get_scheduler_kwargs(args)

lbann.contrib.launcher.run(trainer, model, data_reader, opt, **kwargs)
Example 20
    l.datatype = lbann.DataType.DOUBLE
    for w in l.weights:
        w.datatype = lbann.DataType.DOUBLE

# ----------------------------------
# Run LBANN
# ----------------------------------

# Create optimizer
opt = lbann.SGD(learn_rate=args.learning_rate)

# Create LBANN objects
iterations_per_epoch = utils.ceildiv(epoch_size, args.mini_batch_size)
num_epochs = utils.ceildiv(args.num_iterations, iterations_per_epoch)
trainer = lbann.Trainer(
    mini_batch_size=args.mini_batch_size,
    num_parallel_readers=0,
)
callbacks = [
    lbann.CallbackPrint(),
    lbann.CallbackTimer(),
    lbann.CallbackDumpWeights(
        directory='embeddings',
        epoch_interval=num_epochs,
        format='distributed_binary',
    ),
]
model = lbann.Model(
    num_epochs,
    layers=lbann.traverse_layer_graph(input_),
    objective_function=obj,
    metrics=metrics,
Example 21
        kfac_args["update_interval_steps"] = args.kfac_update_interval_steps
    kfac_args["compute_interval"] = args.kfac_compute_interval_steps
    algo = lbann.KFAC("kfac", algo, **kfac_args)

# Setup model
model = lbann.Model(args.num_epochs,
                    layers=lbann.traverse_layer_graph(input_),
                    objective_function=obj,
                    metrics=metrics,
                    callbacks=callbacks)

# Setup optimizer
opt = lbann.contrib.args.create_optimizer(args)

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size,
                        training_algo=algo)

# Setup environment variables
environment = {"LBANN_KEEP_ERROR_SIGNALS": 1}

# ----------------------------------
# Run experiment
# ----------------------------------
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer,
                           model,
                           data_reader,
                           opt,
                           job_name=args.job_name,
                           environment=environment,
                           batch_job=args.batch_job,