Example #1
def make_batch_script(trainer_params, model_params, script_params):

    # Create LBANN objects
    trainer = lbann.Trainer(mini_batch_size=trainer_params.mini_batch_size)
    model = make_model(**model_params)
    reader = make_data_reader()

    # Optimizer with learning rate schedule
    # Note: Rough approximation of
    #   embed_dim^-0.5 * min(step^-0.5, step*warmup^-1.5)
    # with embed_dim=512 and warmup=4000.
    opt = lbann.Adam(learn_rate=0.0001, beta1=0.9, beta2=0.98, eps=1e-9)
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[1],
            amt=2,
        ))
    model.callbacks.append(
        lbann.CallbackDropFixedLearningRate(
            drop_epoch=[2, 4, 8, 12],
            amt=0.75,
        ))

    # Checkpoint after every epoch
    trainer.callbacks.append(
        lbann.CallbackCheckpoint(
            checkpoint_dir=os.path.join(script_params['work_dir'],
                                        'checkpoint'),
            checkpoint_epochs=1,
        ))

    # Dump weights after every epoch
    model.callbacks.append(
        lbann.CallbackDumpWeights(
            basename=os.path.join(script_params['work_dir'], 'weights'),
            epoch_interval=1,
        ))

    # Create Protobuf file
    protobuf_file = os.path.join(script_params['work_dir'],
                                 'experiment.prototext')
    lbann.proto.save_prototext(
        protobuf_file,
        trainer=trainer,
        model=model,
        data_reader=reader,
        optimizer=opt,
    )

    # Create batch script
    script = lbann.contrib.launcher.make_batch_script(**script_params)
    script.add_command('echo "Started training at $(date)"')
    script.add_parallel_command([
        lbann.lbann_exe(),
        f'--prototext={protobuf_file}',
    ])
    script.add_command('status=$?')
    script.add_command('echo "Finished training at $(date)"')
    script.add_command('exit ${status}')
    return script
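
For reference, the learning-rate schedule that the two drop callbacks above approximate can be written out directly. This is a minimal sketch of the Transformer-style formula from the comment (plain Python; the helper name is hypothetical):

def transformer_lr(step, embed_dim=512, warmup=4000):
    """Reference for the schedule approximated above:
    embed_dim^-0.5 * min(step^-0.5, step * warmup^-1.5)."""
    return embed_dim ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)
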
Example #2
def setup_experiment(lbann):
    """Construct LBANN experiment.

    Args:
        lbann (module): Module for LBANN Python frontend.

    """

    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)

    callbacks = [
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
        lbann.CallbackGPUMemoryUsage()
    ]

    model = Sparse_Graph_Trainer.make_model(kernel_type='GatedGraph',
                                            num_epochs=num_epochs,
                                            callbacks=callbacks)
    reader = data.PROTEINS.make_data_reader()

    # No validation set

    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
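
A hypothetical driver for this setup function, assuming the module-level mini_batch_size and num_epochs globals from the original script are defined, and using lbann.run as the later examples do:

import lbann

trainer, model, reader, optimizer = setup_experiment(lbann)
status = lbann.run(trainer, model, reader, optimizer,
                   nodes=1, procs_per_node=2, scheduler='slurm')
print(status)
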
Example #3
def setup_experiment(lbann):
    """Construct LBANN experiment.

    Args:
        lbann (module): Module for LBANN Python frontend.

    """

    trainer = lbann.Trainer(mini_batch_size=mini_batch_size)

    model = make_model(NUM_NODES,
                       NUM_EDGES,
                       NUM_NODES_FEATURES,
                       NUM_EDGE_FEATURES,
                       EMBEDDING_DIM,
                       EDGE_EMBEDDING_DIM,
                       NUM_OUT_FEATURES,
                       num_epochs)
    reader = LSC_PPQM4M.make_data_reader("LSC_100K",
                                         validation_percent=0)

    optimizer = lbann.Adam(learn_rate=0.01, beta1=0.9, beta2=0.99, eps=1e-8)
    return trainer, model, reader, optimizer
Example #4
    fldr_name = now.strftime('%Y%m%d_%H%M%S')  # timestamp for the results folder

    data_pct, val_ratio = 1.0, 0.1  # fraction of data to use, fraction held out for validation
    batchsize = args.batchsize
    step_interval = args.step_interval

    print('Step interval', step_interval)
    work_dir = "/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/512square/{0}_bsize{1}_{2}".format(
        fldr_name, batchsize, args.suffix)
    
    #####################
    ### Run lbann
    trainer = lbann.Trainer(
        mini_batch_size=batchsize,
        random_seed=random_seed,
        callbacks=lbann.CallbackCheckpoint(checkpoint_dir='chkpt',
                                           checkpoint_epochs=10))
    #                                      checkpoint_steps=step_interval))

    spectral_loss = args.spec_loss
    print("Spectral loss: ", spectral_loss)
    # 'step_interval * val_ratio' is the step interval for the validation set.
    model = construct_model(num_epochs, mcr,
                            spectral_loss=spectral_loss,
                            save_batch_interval=int(step_interval))
    # Setup optimizer
    opt = lbann.Adam(learn_rate=args.learn_rate, beta1=0.5, beta2=0.99, eps=1e-8)
    # Load data reader from prototext
    data_reader = construct_data_reader(data_pct, val_ratio)
    
    status = lbann.run(trainer,model, data_reader, opt,
                       nodes=num_nodes, procs_per_node=num_procs, 
                       work_dir=work_dir,
                       scheduler='slurm', time_limit=1440, setup_only=False)
    
    print(status)
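
The commented-out checkpoint_steps argument above is the step-based alternative to checkpoint_epochs; Example #9 below uses that variant. A minimal sketch of the swap (same callback, different trigger; the variable name is hypothetical):

# Hypothetical: checkpoint every step_interval steps instead of every 10 epochs
checkpoint_cb = lbann.CallbackCheckpoint(checkpoint_dir='chkpt',
                                         checkpoint_steps=step_interval)
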

Example #5
callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]

layers = list(lbann.traverse_layer_graph([images, responses]))
model = lbann.Model(args.num_epochs,
                    layers=layers,
                    metrics=metrics,
                    objective_function=mse,
                    callbacks=callbacks)

# Load data reader from prototext
from google.protobuf import text_format as txtf  # needed for txtf.Merge below
data_reader_proto = lbann.lbann_pb2.LbannPB()
with open(data_reader_prototext, 'r') as f:
    txtf.Merge(f.read(), data_reader_proto)
data_reader_proto = data_reader_proto.data_reader

# Setup trainer
trainer = lbann.Trainer(mini_batch_size=args.mini_batch_size)

# Setup optimizer
opt = lbann.Adam(learn_rate=0.0002, beta1=0.9, beta2=0.99, eps=1e-8)

# Run experiment
kwargs = lbann.contrib.args.get_scheduler_kwargs(args)
lbann.contrib.launcher.run(trainer,
                           model,
                           data_reader_proto,
                           opt,
                           lbann_args=" --use_data_store --preload_data_store",
                           job_name=args.job_name,
                           **kwargs)
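
For context, the args object used throughout this example would come from an argparse setup along these lines; this is a hypothetical sketch (argument names assumed) pairing lbann.contrib.args.add_scheduler_arguments with the get_scheduler_kwargs call above:

import argparse
import lbann.contrib.args

parser = argparse.ArgumentParser()
lbann.contrib.args.add_scheduler_arguments(parser)  # nodes, partition, account, ...
parser.add_argument('--job-name', default='lbann_job', type=str)   # assumed
parser.add_argument('--num-epochs', default=100, type=int)         # assumed
parser.add_argument('--mini-batch-size', default=256, type=int)    # assumed
args = parser.parse_args()
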
Example #6
def main():
    run_args = construct_lc_launcher_args()

    # add data_config data
    # and do not overwrite args if data_reader_prototext is enabled
    if os.path.isfile(
            run_args.data_config) and not run_args.data_reader_prototext:
        with open(run_args.data_config, "r") as f:
            config = json.load(f)
        for k, v in config.items():
            setattr(run_args, k, v)

    trainer = lbann.Trainer(run_args.batch_size)

    # define data_reader
    if run_args.data_reader_prototext:
        print("Using data_reader_prototext")
        assert run_args.sequence_length is not None
        assert run_args.vocab is not None

        data_reader_proto = lbann.lbann_pb2.LbannPB()
        # txtf assumed imported as: from google.protobuf import text_format as txtf
        with open(run_args.data_reader_prototext, "r") as f:
            txtf.Merge(f.read(), data_reader_proto)
        data_reader = data_reader_proto.data_reader
    else:
        data_reader = construct_data_reader(run_args)

    if "LBANN_EXPERIMENT_DIR" in os.environ:
        work_dir = os.environ["LBANN_EXPERIMENT_DIR"]
    else:
        work_dir = os.getcwd()
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_dir = os.path.join(work_dir,
                                  "{}_{}".format(timestamp, run_args.job_name))
    if not os.path.exists(experiment_dir):
        os.makedirs(experiment_dir)

    # model and optimizer
    model = construct_model(run_args)
    opt = lbann.Adam(learn_rate=run_args.lr, beta1=0.9, beta2=0.99, eps=1e-8)

    # Dump the config to experiment_dir so it can be used to load the model
    # in PyTorch (moses codebase)
    ppn = 4 if run_args.scheduler == "lsf" else 2
    print("args:\n" + str(run_args))
    if run_args.scheduler == 'slurm':
        import torch
        torch.save(run_args, "{}/{}_config.pt".format(experiment_dir,
                                                      run_args.job_name))

    m_lbann_args = (
        f"--load_model_weights_dir_is_complete"
        f" --load_model_weights_dir={run_args.dump_model_dir}"
        f" --vocab={run_args.vocab}"
        f" --num_samples={run_args.num_samples}"
        f" --sequence_length={run_args.sequence_length}"
        f" --num_io_threads={run_args.num_io_threads}"
        f" --no_header={run_args.no_header}"
        f" --delimiter={run_args.delimiter}")
    if run_args.data_reader_prototext:
        m_lbann_args = " ".join(
            (m_lbann_args, " --use_data_store --preload_data_store "))
    if run_args.procs_per_trainer:
        m_lbann_args = " ".join(
            (m_lbann_args,
             f"--procs_per_trainer={run_args.procs_per_trainer}"))

    status = lbann.contrib.launcher.run(
        trainer,
        model,
        data_reader,
        opt,
        lbann_exe,
        partition=run_args.partition,
        scheduler=run_args.scheduler,
        account=run_args.account,
        time_limit=run_args.time_limit,
        nodes=run_args.nodes,
        procs_per_node=ppn,
        batch_job=True,
        #setup_only = True,
        job_name=run_args.job_name,
        experiment_dir=experiment_dir,
        lbann_args=m_lbann_args,
        # Turn on Tensor Core ops in cuBLAS/cuDNN
        environment={
            'LBANN_USE_CUBLAS_TENSOR_OPS': 1,
            'LBANN_USE_CUDNN_TENSOR_OPS': 1,
        },
    )

    print("LBANN launcher status:\n" + str(status))
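
Presumably the script invokes main() under the standard entry-point guard (not shown in this excerpt):

if __name__ == '__main__':
    main()
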
Example #7
def create_unet3d_optimizer(learn_rate):
    # TODO: This is a temporary optimizer copied from CosmoFlow.
    adam = lbann.Adam(learn_rate=learn_rate, beta1=0.9, beta2=0.999, eps=1e-8)
    return adam
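
A one-line usage sketch (the learning rate here is a hypothetical value):

opt = create_unet3d_optimizer(learn_rate=1e-3)  # hypothetical learning rate
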
Example #8
    objective_function=obj,
    metrics=metrics,
    callbacks=[
        lbann.CallbackPrint(),
        lbann.CallbackTimer(),
    ],
)

# Setup trainer, optimizer, data_reader
trainer = lbann.Trainer(
    mini_batch_size=lbann_params.mini_batch_size,
    num_parallel_readers=1,
)
optimizer = lbann.Adam(
    learn_rate=0.01,
    beta1=0.9,
    beta2=0.99,
    eps=1e-8,
)
data_reader = make_data_reader()

# Launch LBANN
kwargs = lbann.contrib.args.get_scheduler_kwargs(lbann_params)
kwargs["environment"] = {}
lbann.contrib.launcher.run(
    trainer,
    model,
    data_reader,
    optimizer,
    work_dir=lbann_params.work_dir,
    job_name=lbann_params.job_name,
    lbann_args=["--num_io_threads=1"],
)
Example #9
            checkpoint_dir='chkpt',
            #   checkpoint_epochs=10))
            checkpoint_steps=gdict['step_interval']))

    spectral_loss = gdict['lambda_spec']
    if spectral_loss:
        print("Using Spectral loss with coupling", spectral_loss)

    model = construct_model(
        num_epochs,
        gdict['mcr'],
        spectral_loss=spectral_loss,
        save_batch_interval=int(gdict['step_interval'])
    )  # 'step_interval * val_ratio' is the step interval for the validation set.
    # Setup optimizer
    opt = lbann.Adam(learn_rate=gdict['learn_rate'],
                     beta1=gdict['beta1'],
                     beta2=gdict['beta2'],
                     eps=float(gdict['eps']))
    # Load data reader from prototext
    data_reader = construct_data_reader(data_pct, val_ratio)

    status = lbann.run(trainer,
                       model,
                       data_reader,
                       opt,
                       nodes=num_nodes,
                       procs_per_node=num_procs,
                       work_dir=work_dir,
                       scheduler='slurm',
                       time_limit=1440,
                       setup_only=False)